AArch64ISelLowering.cpp 927 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
141921420214212142221423214242142521426214272142821429214302143121432214332143421435214362143721438214392144021441214422144321444214452144621447214482144921450214512145221453214542145521456214572145821459214602146121462214632146421465214662146721468214692147021471214722147321474214752147621477214782147921480214812148221483214842148521486214872148821489214902149121492214932149421495214962149721498214992150021501215022150321504215052150621507215082150921510215112151221513215142151521516215172151821519215202152121522215232152421525215262152721528215292153021531215322153321534215352153621537215382153921540215412154221543215442154521546215472154821549215502155121552215532155421555215562155721558215592156021561215622156321564215652156621567215682156921570215712157221573215742157521576215772157821579215802158121582215832158421585215862158721588215892159021591215922159321594215952159621597215982159921600216012160221603216042160521606216072160821609216102161121612216132161421615216162161721618216192162021621216222162321624216252162621627216282162921630216312163221633216342163521636216372163821639216402164121642216432164421645216462164721648216492165021651216522165321654216552165621657216582165921660216612166221663216642166521666216672166821669216702167121672216732167421675216762167721678216792168021681216822168321684216852168621687216882168921690216912169221693216942169521696216972169821699217002170121702217032170421705217062170721708217092171021711217122171321714217152171621717217182171921720217212172221723217242172521726217272172821729217302173121732217332173421735217362173721738217392174021741217422174321744217452174621747217482174921750217512175221753217542175521756217572175821759217602176121762217632176421765217662176721768217692177021771217722177321774217752177621777217782177921780217812178221783217842178521786217872178821789217902179121792217932179421795217962179721798217992180021801218022180321804218052180621807218082180921810218112181221813218142181521816218172181821819218202182121822218232182421825218262182721828218292183021831218322183321834218352183621837218382183921840218412184221843218442184521846218472184821849218502185121852218532185421855218562185721858218592186021861218622186321864218652186621867218682186921870218712187221873218742187521876218772187821879218802188121882218832188421885218862188721888218892189021891218922189321894218952189621897218982189921900219012190221903219042190521906219072190821909219102191121912219132191421915219162191721918219192192021921219222192321924219252192621927219282192921930219312193221933219342193521936219372193821939219402194121942219432194421945219462194721948219492195021951219522195321954219552195621957219582195921960219612196221963219642196521966219672196821969219702197121972219732197421975219762197721978219792198021981219822198321984219852198621987219882198921990219912199221993219942199521996219972199821999220002200122002220032200422005220062200722008220092201022011220122201322014220152201622017220182201922020220212202222023220242202522026220272202822029220302203122032220332203422035220362203722038220392204022041220422204322044220452204622047220482204922050220512205222053220542205522056220572205822059220602206122062220632206422065220662206722068220692207022071220722207322074220752207622077220782207922080220812208222083220842208522086220872208822089220902209122092220932209422095220962209722098220992210022101221022210322104221052210622107221082210922110221112211222113221142211522116221172211822119221202212122122221232212422125221262212722128221292
213022131221322213322134221352213622137221382213922140221412214222143221442214522146221472214822149221502215122152221532215422155221562215722158221592216022161221622216322164221652216622167221682216922170221712217222173221742217522176221772217822179221802218122182221832218422185221862218722188221892219022191221922219322194221952219622197221982219922200222012220222203222042220522206222072220822209222102221122212222132221422215222162221722218222192222022221222222222322224222252222622227222282222922230222312223222233222342223522236222372223822239222402224122242222432224422245222462224722248222492225022251222522225322254222552225622257222582225922260222612226222263222642226522266222672226822269222702227122272222732227422275222762227722278222792228022281222822228322284222852228622287222882228922290222912229222293222942229522296222972229822299223002230122302223032230422305223062230722308223092231022311223122231322314223152231622317223182231922320223212232222323223242232522326223272232822329223302233122332223332233422335223362233722338223392234022341223422234322344223452234622347223482234922350223512235222353223542235522356223572235822359223602236122362223632236422365223662236722368223692237022371223722237322374223752237622377223782237922380223812238222383223842238522386223872238822389223902239122392223932239422395223962239722398223992240022401224022240322404224052240622407224082240922410224112241222413224142241522416224172241822419224202242122422224232242422425224262242722428224292243022431224322243322434224352243622437224382243922440224412244222443224442244522446224472244822449224502245122452224532245422455224562245722458224592246022461224622246322464224652246622467224682246922470224712247222473224742247522476224772247822479224802248122482224832248422485224862248722488224892249022491224922249322494224952249622497224982249922500225012250222503225042250522506225072250822509225102251122512225132251422515225162251722518225192252022521225222252322524225252252622527225282252922530225312253222533225342253522536225372253822539225402254122542225432254422545225462254722548225492255022551225522255322554225552255622557225582255922560225612256222563225642256522566225672256822569225702257122572225732257422575225762257722578225792258022581225822258322584225852258622587225882258922590225912259222593225942259522596225972259822599226002260122602226032260422605226062260722608226092261022611226122261322614226152261622617226182261922620226212262222623226242262522626226272262822629226302263122632226332263422635226362263722638226392264022641226422264322644226452264622647226482264922650226512265222653226542265522656226572265822659226602266122662226632266422665226662266722668226692267022671226722267322674226752267622677226782267922680226812268222683226842268522686226872268822689226902269122692226932269422695226962269722698226992270022701227022270322704227052270622707227082270922710227112271222713227142271522716227172271822719227202272122722227232272422725227262272722728227292273022731227322273322734227352273622737227382273922740227412274222743227442274522746227472274822749227502275122752227532275422755227562275722758227592276022761227622276322764227652276622767227682276922770227712277222773227742277522776227772277822779227802278122782227832278422785227862278722788227892279022791227922279322794227952279622797227982279922800228012280222803228042280522806228072280822809228102281122812228132281422815228162281722818228192282022821228222282322824228252282622827228282282922830228312283222833228342283522836228372283822839228402
284122842228432284422845228462284722848228492285022851228522285322854228552285622857228582285922860228612286222863228642286522866228672286822869228702287122872228732287422875228762287722878228792288022881228822288322884228852288622887228882288922890228912289222893228942289522896228972289822899229002290122902229032290422905229062290722908229092291022911229122291322914229152291622917229182291922920229212292222923229242292522926229272292822929229302293122932229332293422935229362293722938229392294022941229422294322944229452294622947229482294922950229512295222953229542295522956229572295822959229602296122962229632296422965229662296722968229692297022971229722297322974229752297622977229782297922980229812298222983229842298522986229872298822989229902299122992229932299422995229962299722998229992300023001230022300323004230052300623007230082300923010230112301223013230142301523016230172301823019230202302123022230232302423025230262302723028230292303023031230322303323034230352303623037230382303923040230412304223043230442304523046230472304823049230502305123052230532305423055230562305723058230592306023061230622306323064230652306623067230682306923070230712307223073230742307523076230772307823079230802308123082230832308423085230862308723088230892309023091230922309323094230952309623097230982309923100231012310223103231042310523106231072310823109231102311123112231132311423115231162311723118231192312023121231222312323124231252312623127231282312923130231312313223133231342313523136231372313823139231402314123142231432314423145231462314723148231492315023151231522315323154231552315623157231582315923160231612316223163231642316523166231672316823169231702317123172231732317423175231762317723178231792318023181231822318323184231852318623187231882318923190231912319223193231942319523196231972319823199232002320123202232032320423205232062320723208232092321023211232122321323214232152321623217232182321923220232212322223223232242322523226232272322823229232302323123232232332323423235232362323723238232392324023241232422324323244232452324623247232482324923250232512325223253232542325523256232572325823259232602326123262232632326423265232662326723268232692327023271232722327323274232752327623277232782327923280232812328223283232842328523286232872328823289232902329123292232932329423295232962329723298232992330023301233022330323304233052330623307233082330923310233112331223313233142331523316233172331823319233202332123322233232332423325233262332723328233292333023331233322333323334233352333623337233382333923340233412334223343233442334523346233472334823349233502335123352233532335423355233562335723358233592336023361233622336323364233652336623367233682336923370233712337223373233742337523376233772337823379233802338123382233832338423385233862338723388233892339023391233922339323394233952339623397233982339923400234012340223403234042340523406234072340823409234102341123412234132341423415234162341723418234192342023421234222342323424234252342623427234282342923430234312343223433234342343523436234372343823439234402344123442234432344423445234462344723448234492345023451234522345323454234552345623457234582345923460234612346223463234642346523466234672346823469234702347123472234732347423475234762347723478234792348023481234822348323484234852348623487234882348923490234912349223493234942349523496234972349823499235002350123502235032350423505235062350723508235092351023511235122351323514235152351623517235182351923520235212352223523235242352523526235272352823529235302353123532235332353423535235362353723538235392354023541235422354323544235452354623547235482354923550235512
3552235532355423555235562355723558235592356023561235622356323564235652356623567235682356923570235712357223573235742357523576235772357823579235802358123582235832358423585235862358723588235892359023591235922359323594235952359623597235982359923600236012360223603236042360523606236072360823609236102361123612236132361423615236162361723618236192362023621236222362323624236252362623627236282362923630236312363223633236342363523636236372363823639236402364123642236432364423645236462364723648236492365023651236522365323654236552365623657236582365923660236612366223663236642366523666236672366823669236702367123672236732367423675236762367723678236792368023681236822368323684236852368623687236882368923690236912369223693236942369523696236972369823699237002370123702237032370423705237062370723708237092371023711237122371323714237152371623717237182371923720237212372223723237242372523726237272372823729237302373123732237332373423735237362373723738237392374023741237422374323744237452374623747237482374923750237512375223753237542375523756237572375823759237602376123762237632376423765237662376723768237692377023771237722377323774237752377623777237782377923780237812378223783237842378523786237872378823789237902379123792237932379423795237962379723798237992380023801238022380323804238052380623807238082380923810238112381223813238142381523816238172381823819238202382123822238232382423825238262382723828238292383023831238322383323834238352383623837238382383923840238412384223843238442384523846238472384823849238502385123852238532385423855238562385723858238592386023861238622386323864238652386623867238682386923870238712387223873238742387523876238772387823879238802388123882238832388423885238862388723888238892389023891238922389323894238952389623897238982389923900239012390223903239042390523906239072390823909239102391123912239132391423915239162391723918239192392023921239222392323924239252392623927239282392923930239312393223933239342393523936239372393823939239402394123942239432394423945239462394723948239492395023951239522395323954239552395623957239582395923960239612396223963239642396523966239672396823969239702397123972239732397423975239762397723978239792398023981239822398323984239852398623987239882398923990239912399223993239942399523996239972399823999240002400124002240032400424005240062400724008240092401024011240122401324014240152401624017240182401924020240212402224023240242402524026240272402824029240302403124032240332403424035240362403724038240392404024041240422404324044240452404624047240482404924050
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));
  117. // Temporary option added for the purpose of testing functionality added
  118. // to DAGCombiner.cpp in D92230. It is expected that this can be removed
  119. // in future when both implementations will be based off MGATHER rather
  120. // than the GLD1 nodes added for the SVE gather load intrinsics.
  121. static cl::opt<bool>
  122. EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
  123. cl::desc("Combine extends of AArch64 masked "
  124. "gather intrinsics"),
  125. cl::init(true));
// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
// bottleneck after this transform on high-end CPUs. This limit on the number
// of leaf nodes is a guard so that the cmp+ccmp form stays profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
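
// Maps a scalar element type to the packed (full-width) SVE vector type that
// holds it, e.g. i32 -> nxv4i32 and f16 -> nxv8f16.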
static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}
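
// Returns the packed integer vector type whose element count matches the
// given predicate type, e.g. nxv4i1 -> nxv4i32.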
static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);

    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVEorSME()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
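
  // Comparisons, selects and conditional branches on scalar integer and FP
  // types are custom lowered so they can be expressed with the NZCV-based
  // CMP/CCMP plus CSEL/B.cond sequences.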
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
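
  // FEAT_CSSC provides scalar CNT, CTZ, ABS and signed/unsigned MIN/MAX
  // instructions, so those operations can stay Legal; without it they are
  // custom lowered (e.g. CTPOP via the NEON CNT+ADDV path).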
  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
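
  // None of these operations have a native instruction; promote the scalar
  // f16 forms (so the f32 lowering/libcall is used) and expand the short
  // FP16 vector forms.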
  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
                  ISD::FEXP,        ISD::FEXP2,        ISD::FLOG,
                  ISD::FLOG2,       ISD::FLOG10,       ISD::STRICT_FREM,
                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
  }

  if (!Subtarget->hasFullFP16()) {
    for (auto Op :
         {ISD::SETCC,          ISD::SELECT_CC,      ISD::BR_CC,
          ISD::FADD,           ISD::FSUB,           ISD::FMUL,
          ISD::FDIV,           ISD::FMA,            ISD::FNEG,
          ISD::FABS,           ISD::FCEIL,          ISD::FSQRT,
          ISD::FFLOOR,         ISD::FNEARBYINT,     ISD::FRINT,
          ISD::FROUND,         ISD::FROUNDEVEN,     ISD::FTRUNC,
          ISD::FMINNUM,        ISD::FMAXNUM,        ISD::FMINIMUM,
          ISD::FMAXIMUM,       ISD::STRICT_FADD,    ISD::STRICT_FSUB,
          ISD::STRICT_FMUL,    ISD::STRICT_FDIV,    ISD::STRICT_FMA,
          ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,  ISD::STRICT_FSQRT,
          ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND,
          ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM,
          ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,   ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::f16, Promote);
    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, MVT::f16, Custom);
    // promote v4f16 to v4f32 when that is known to be safe.
    setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (auto Op :
       {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
        ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
        ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
        ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
        ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
        ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
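
  // 128-bit compare-and-swap needs custom lowering, and ATOMIC_LOAD_SUB /
  // ATOMIC_LOAD_AND are custom lowered so they can be re-expressed as
  // ATOMIC_LOAD_ADD / ATOMIC_LOAD_CLR with the operand negated or inverted.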
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // subtarget
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
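
    // Register the names of the __aarch64_* outline atomic helpers, one per
    // operation, access size and memory-ordering variant.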
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);
  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV..
  setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP});

  setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                       ISD::FP_TO_UINT_SAT, ISD::FDIV});

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
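
  // Extension, vector splice/concat/subvector, store and build_vector nodes
  // also have target-specific combines.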
  setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MSTORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});

  setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                       ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                       ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});

  setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);

  setTargetDAGCombine(ISD::CTLZ);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = 4;
  MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the sub target, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setMaxDivRemBitWidthSupported(128);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    for (auto Op :
         {ISD::SELECT,         ISD::SELECT_CC,      ISD::SETCC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::v1f64, Expand);

    for (auto Op :
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
          ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
          ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
          ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
      setOperationAction(Op, MVT::v1i64, Expand);
    // AArch64 doesn't have direct vector -> f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction,
    // nor a direct i32 -> f16 one. Set these to Custom so the conversion
    // happens in two steps: v4i32 -> v4f32 -> v4f16.
    for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                    ISD::STRICT_UINT_TO_FP})
      for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
        setOperationAction(Op, VT, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // when AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);

    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
    }

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // Saturates
    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32, MVT::v2i64}) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::AVGFLOORS, VT, Legal);
      setOperationAction(ISD::AVGFLOORU, VT, Legal);
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : {MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32,
                   MVT::v2f64}) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (auto Op :
         {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
          ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
          ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
        setOperationAction(Op, Ty, Legal);
      if (Subtarget->hasFullFP16())
        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
          setOperationAction(Op, Ty, Legal);
    }

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);

    // ADDP custom lowering
    for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64})
      setOperationAction(ISD::ADD, VT, Custom);
    // FADDP custom lowering
    for (MVT VT : {MVT::v16f16, MVT::v8f32, MVT::v4f64})
      setOperationAction(ISD::FADD, VT, Custom);
  }

  if (Subtarget->hasSME()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  }

  // FIXME: Move lowering for more nodes here if those are common between
  // SVE and SME.
  if (Subtarget->hasSVEorSME()) {
    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
  }
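
  // SVE data vector types: most of these operations are custom lowered onto
  // the predicated SVE node forms.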
  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::ABDS, VT, Custom);
      setOperationAction(ISD::ABDU, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);

      if (Subtarget->hasSVE2()) {
        setOperationAction(ISD::AVGFLOORS, VT, Custom);
        setOperationAction(ISD::AVGFLOORU, VT, Custom);
        setOperationAction(ISD::AVGCEILS, VT, Custom);
        setOperationAction(ISD::AVGCEILU, VT, Custom);
      }
    }

    // Illegal unpacked integer vector types.
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
      setOperationAction(ISD::BITCAST, VT, Custom);

    for (auto VT :
         {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
          MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16})
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
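
    // Operations on the SVE predicate (nxv*i1) vector types.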
    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      }
    }

    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
    // Firstly, exclude all scalable vector extending loads/truncating stores,
    // including both integer and floating-point scalable vector types.
    for (MVT VT : MVT::scalable_vector_valuetypes()) {
      for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }
    // Then, selectively enable those which we directly support.
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
    setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
    for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
      setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
    }

    // SVE supports truncating stores of 64 and 128-bit vectors
    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  1216. for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
  1217. MVT::nxv4f32, MVT::nxv2f64}) {
  1218. setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  1219. setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
  1220. setOperationAction(ISD::MGATHER, VT, Custom);
  1221. setOperationAction(ISD::MSCATTER, VT, Custom);
  1222. setOperationAction(ISD::MLOAD, VT, Custom);
  1223. setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
  1224. setOperationAction(ISD::SELECT, VT, Custom);
  1225. setOperationAction(ISD::FADD, VT, Custom);
  1226. setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  1227. setOperationAction(ISD::FDIV, VT, Custom);
  1228. setOperationAction(ISD::FMA, VT, Custom);
  1229. setOperationAction(ISD::FMAXIMUM, VT, Custom);
  1230. setOperationAction(ISD::FMAXNUM, VT, Custom);
  1231. setOperationAction(ISD::FMINIMUM, VT, Custom);
  1232. setOperationAction(ISD::FMINNUM, VT, Custom);
  1233. setOperationAction(ISD::FMUL, VT, Custom);
  1234. setOperationAction(ISD::FNEG, VT, Custom);
  1235. setOperationAction(ISD::FSUB, VT, Custom);
  1236. setOperationAction(ISD::FCEIL, VT, Custom);
  1237. setOperationAction(ISD::FFLOOR, VT, Custom);
  1238. setOperationAction(ISD::FNEARBYINT, VT, Custom);
  1239. setOperationAction(ISD::FRINT, VT, Custom);
  1240. setOperationAction(ISD::FROUND, VT, Custom);
  1241. setOperationAction(ISD::FROUNDEVEN, VT, Custom);
  1242. setOperationAction(ISD::FTRUNC, VT, Custom);
  1243. setOperationAction(ISD::FSQRT, VT, Custom);
  1244. setOperationAction(ISD::FABS, VT, Custom);
  1245. setOperationAction(ISD::FP_EXTEND, VT, Custom);
  1246. setOperationAction(ISD::FP_ROUND, VT, Custom);
  1247. setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  1248. setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  1249. setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  1250. setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  1251. setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
  1252. setOperationAction(ISD::SELECT_CC, VT, Expand);
  1253. setOperationAction(ISD::FREM, VT, Expand);
  1254. setOperationAction(ISD::FPOW, VT, Expand);
  1255. setOperationAction(ISD::FPOWI, VT, Expand);
  1256. setOperationAction(ISD::FCOS, VT, Expand);
  1257. setOperationAction(ISD::FSIN, VT, Expand);
  1258. setOperationAction(ISD::FSINCOS, VT, Expand);
  1259. setOperationAction(ISD::FEXP, VT, Expand);
  1260. setOperationAction(ISD::FEXP2, VT, Expand);
  1261. setOperationAction(ISD::FLOG, VT, Expand);
  1262. setOperationAction(ISD::FLOG2, VT, Expand);
  1263. setOperationAction(ISD::FLOG10, VT, Expand);
  1264. setCondCodeAction(ISD::SETO, VT, Expand);
  1265. setCondCodeAction(ISD::SETOLT, VT, Expand);
  1266. setCondCodeAction(ISD::SETLT, VT, Expand);
  1267. setCondCodeAction(ISD::SETOLE, VT, Expand);
  1268. setCondCodeAction(ISD::SETLE, VT, Expand);
  1269. setCondCodeAction(ISD::SETULT, VT, Expand);
  1270. setCondCodeAction(ISD::SETULE, VT, Expand);
  1271. setCondCodeAction(ISD::SETUGE, VT, Expand);
  1272. setCondCodeAction(ISD::SETUGT, VT, Expand);
  1273. setCondCodeAction(ISD::SETUEQ, VT, Expand);
  1274. setCondCodeAction(ISD::SETONE, VT, Expand);
  1275. }
  1276. for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
  1277. setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  1278. setOperationAction(ISD::MGATHER, VT, Custom);
  1279. setOperationAction(ISD::MSCATTER, VT, Custom);
  1280. setOperationAction(ISD::MLOAD, VT, Custom);
  1281. setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
  1282. setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
  1283. }
  1284. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
  1285. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  1286. // NEON doesn't support integer divides, but SVE does
  1287. for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
  1288. MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
  1289. setOperationAction(ISD::SDIV, VT, Custom);
  1290. setOperationAction(ISD::UDIV, VT, Custom);
  1291. }
  1292. // NEON doesn't support 64-bit vector integer muls, but SVE does.
  1293. setOperationAction(ISD::MUL, MVT::v1i64, Custom);
  1294. setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1295. // NEON doesn't support strictly-ordered (sequential) FADD reductions, but SVE does (FADDA).
  1296. for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
  1297. setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  1298. if (Subtarget->forceStreamingCompatibleSVE()) {
  1299. setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
  1300. setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
  1301. setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
  1302. setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
  1303. setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
  1304. setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
  1305. setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
  1306. setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
  1307. setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
  1308. for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
  1309. MVT::v4i32, MVT::v1i64, MVT::v2i64})
  1310. addTypeForStreamingSVE(VT);
  1311. for (MVT VT :
  1312. {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
  1313. addTypeForStreamingSVE(VT);
  1314. }
  1315. // NOTE: Currently this has to happen after computeRegisterProperties rather
  1316. // than the preferred option of combining it with the addRegisterClass call.
  1317. if (Subtarget->useSVEForFixedLengthVectors()) {
  1318. for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
  1319. if (useSVEForFixedLengthVectorVT(VT))
  1320. addTypeForFixedLengthSVE(VT);
  1321. for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
  1322. if (useSVEForFixedLengthVectorVT(VT))
  1323. addTypeForFixedLengthSVE(VT);
1324. // 64-bit results can come from an input wider than a NEON vector.
  1325. for (auto VT : {MVT::v8i8, MVT::v4i16})
  1326. setOperationAction(ISD::TRUNCATE, VT, Custom);
  1327. setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1328. // 128-bit results imply an input wider than a NEON vector.
  1329. for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
  1330. setOperationAction(ISD::TRUNCATE, VT, Custom);
  1331. for (auto VT : {MVT::v8f16, MVT::v4f32})
  1332. setOperationAction(ISD::FP_ROUND, VT, Custom);
  1333. // These operations are not supported on NEON but SVE can do them.
  1334. setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
  1335. setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
  1336. setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  1337. setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
  1338. setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
  1339. setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
  1340. setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
  1341. setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
  1342. setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
  1343. setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
  1344. setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
  1345. setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
  1346. setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
  1347. setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
  1348. setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
  1349. setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
  1350. setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
  1351. setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
  1352. setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
  1353. setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
  1354. // Int operations with no NEON support.
  1355. for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
  1356. MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
  1357. setOperationAction(ISD::BITREVERSE, VT, Custom);
  1358. setOperationAction(ISD::CTTZ, VT, Custom);
  1359. setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
  1360. setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
  1361. setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
  1362. }
  1363. // Use SVE for vectors with more than 2 elements.
  1364. for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
  1365. setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  1366. }
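// There is no predicate form of the SVE SPLICE instruction, so splices of
// predicate vectors are promoted to equivalently sized integer vectors and
// the result is truncated back to a predicate.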
  1367. setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
  1368. setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
  1369. setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
  1370. setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
  1371. setOperationAction(ISD::VSCALE, MVT::i32, Custom);
  1372. }
  1373. if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
  1374. // Only required for llvm.aarch64.mops.memset.tag
  1375. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
  1376. }
  1377. setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  1378. PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
  1379. IsStrictFPEnabled = true;
  1380. }
  1381. void AArch64TargetLowering::addTypeForNEON(MVT VT) {
  1382. assert(VT.isVector() && "VT should be a vector type");
  1383. if (VT.isFloatingPoint()) {
  1384. MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
  1385. setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
  1386. setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
  1387. }
  1388. // Mark vector float intrinsics as expand.
  1389. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
  1390. setOperationAction(ISD::FSIN, VT, Expand);
  1391. setOperationAction(ISD::FCOS, VT, Expand);
  1392. setOperationAction(ISD::FPOW, VT, Expand);
  1393. setOperationAction(ISD::FLOG, VT, Expand);
  1394. setOperationAction(ISD::FLOG2, VT, Expand);
  1395. setOperationAction(ISD::FLOG10, VT, Expand);
  1396. setOperationAction(ISD::FEXP, VT, Expand);
  1397. setOperationAction(ISD::FEXP2, VT, Expand);
  1398. }
  1399. // But we do support custom-lowering for FCOPYSIGN.
  1400. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
  1401. ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
  1402. setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  1403. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  1404. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  1405. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  1406. setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
  1407. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  1408. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  1409. setOperationAction(ISD::SRA, VT, Custom);
  1410. setOperationAction(ISD::SRL, VT, Custom);
  1411. setOperationAction(ISD::SHL, VT, Custom);
  1412. setOperationAction(ISD::OR, VT, Custom);
  1413. setOperationAction(ISD::SETCC, VT, Custom);
  1414. setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  1415. setOperationAction(ISD::SELECT, VT, Expand);
  1416. setOperationAction(ISD::SELECT_CC, VT, Expand);
  1417. setOperationAction(ISD::VSELECT, VT, Expand);
  1418. for (MVT InnerVT : MVT::all_valuetypes())
  1419. setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
  1420. // CNT supports only B element sizes, then use UADDLP to widen.
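// E.g. a v4i32 CTPOP becomes a CNT of the bytes followed by pairwise widening
// adds: v16i8 -> (UADDLP) v8i16 -> (UADDLP) v4i32.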
  1421. if (VT != MVT::v8i8 && VT != MVT::v16i8)
  1422. setOperationAction(ISD::CTPOP, VT, Custom);
  1423. setOperationAction(ISD::UDIV, VT, Expand);
  1424. setOperationAction(ISD::SDIV, VT, Expand);
  1425. setOperationAction(ISD::UREM, VT, Expand);
  1426. setOperationAction(ISD::SREM, VT, Expand);
  1427. setOperationAction(ISD::FREM, VT, Expand);
  1428. for (unsigned Opcode :
  1429. {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
  1430. ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
  1431. setOperationAction(Opcode, VT, Custom);
  1432. if (!VT.isFloatingPoint())
  1433. setOperationAction(ISD::ABS, VT, Legal);
  1434. // [SU][MIN|MAX] are available for all NEON types apart from i64.
  1435. if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
  1436. for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
  1437. setOperationAction(Opcode, VT, Legal);
  1438. // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
  1439. // NEON types.
  1440. if (VT.isFloatingPoint() &&
  1441. VT.getVectorElementType() != MVT::bf16 &&
  1442. (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
  1443. for (unsigned Opcode :
  1444. {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
  1445. ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
  1446. ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
  1447. ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
  1448. ISD::STRICT_FSQRT})
  1449. setOperationAction(Opcode, VT, Legal);
  1450. // Strict fp extend and trunc are legal
  1451. if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
  1452. setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
  1453. if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
  1454. setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
  1455. // FIXME: We could potentially make use of the vector comparison instructions
1456. // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
  1457. // complications:
  1458. // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
  1459. // so we would need to expand when the condition code doesn't match the
  1460. // kind of comparison.
  1461. // * Some kinds of comparison require more than one FCMXY instruction so
  1462. // would need to be expanded instead.
  1463. // * The lowering of the non-strict versions involves target-specific ISD
  1464. // nodes so we would likely need to add strict versions of all of them and
  1465. // handle them appropriately.
  1466. setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
  1467. setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
  1468. if (Subtarget->isLittleEndian()) {
  1469. for (unsigned im = (unsigned)ISD::PRE_INC;
  1470. im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
  1471. setIndexedLoadAction(im, VT, Legal);
  1472. setIndexedStoreAction(im, VT, Legal);
  1473. }
  1474. }
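// FEAT_D128 adds 128-bit system registers, accessed as register pairs via the
// MRRS/MSRR instructions.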
  1475. if (Subtarget->hasD128()) {
  1476. setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
  1477. setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
  1478. }
  1479. }
  1480. bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
  1481. EVT OpVT) const {
  1482. // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
  1483. if (!Subtarget->hasSVE())
  1484. return true;
  1485. // We can only support legal predicate result types. We can use the SVE
  1486. // whilelo instruction for generating fixed-width predicates too.
  1487. if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
  1488. ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
  1489. ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
  1490. return true;
  1491. // The whilelo instruction only works with i32 or i64 scalar inputs.
  1492. if (OpVT != MVT::i32 && OpVT != MVT::i64)
  1493. return true;
  1494. return false;
  1495. }
  1496. void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
  1497. // By default set all operations to Expand,
  1498. // then change to Legal/Custom if needed.
  1499. for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
  1500. setOperationAction(Op, VT, Expand);
  1501. assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
  1502. if (VT.isFloatingPoint()) {
  1503. setCondCodeAction(ISD::SETO, VT, Expand);
  1504. setCondCodeAction(ISD::SETOLT, VT, Expand);
  1505. setCondCodeAction(ISD::SETOLE, VT, Expand);
  1506. setCondCodeAction(ISD::SETULT, VT, Expand);
  1507. setCondCodeAction(ISD::SETULE, VT, Expand);
  1508. setCondCodeAction(ISD::SETUGE, VT, Expand);
  1509. setCondCodeAction(ISD::SETUGT, VT, Expand);
  1510. setCondCodeAction(ISD::SETUEQ, VT, Expand);
  1511. setCondCodeAction(ISD::SETONE, VT, Expand);
  1512. }
  1513. // STORE, LOAD, SCALAR_TO_VECTOR and BITCAST are natively supported,
  1514. // so no need to Custom/Expand them.
  1515. setOperationAction(ISD::STORE, VT, Legal);
  1516. setOperationAction(ISD::LOAD, VT, Legal);
  1517. setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
  1518. setOperationAction(ISD::BITCAST, VT, Legal);
  1519. // Mark integer truncating stores/extending loads as having custom lowering
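// E.g. for VT == v4i32 this marks the v4i8 and v4i16 truncating stores and
// sign/zero-extending loads as Custom.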
  1520. if (VT.isInteger()) {
  1521. MVT InnerVT = VT.changeVectorElementType(MVT::i8);
  1522. while (InnerVT != VT) {
  1523. setTruncStoreAction(VT, InnerVT, Custom);
  1524. setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
  1525. setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
  1526. InnerVT = InnerVT.changeVectorElementType(
  1527. MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
  1528. }
  1529. }
  1530. // Mark floating-point truncating stores/extending loads as having custom
  1531. // lowering
  1532. if (VT.isFloatingPoint()) {
  1533. MVT InnerVT = VT.changeVectorElementType(MVT::f16);
  1534. while (InnerVT != VT) {
  1535. setTruncStoreAction(VT, InnerVT, Custom);
  1536. setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
  1537. InnerVT = InnerVT.changeVectorElementType(
  1538. MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
  1539. }
  1540. }
  1541. setOperationAction(ISD::ABS, VT, Custom);
  1542. setOperationAction(ISD::ADD, VT, Custom);
  1543. setOperationAction(ISD::AND, VT, Custom);
  1544. setOperationAction(ISD::ANY_EXTEND, VT, Custom);
  1545. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  1546. setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  1547. setOperationAction(ISD::CTLZ, VT, Custom);
  1548. setOperationAction(ISD::CTPOP, VT, Custom);
  1549. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  1550. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  1551. setOperationAction(ISD::FABS, VT, Custom);
  1552. setOperationAction(ISD::FADD, VT, Custom);
  1553. setOperationAction(ISD::FCEIL, VT, Custom);
  1554. setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  1555. setOperationAction(ISD::FDIV, VT, Custom);
  1556. setOperationAction(ISD::FFLOOR, VT, Custom);
  1557. setOperationAction(ISD::FMA, VT, Custom);
  1558. setOperationAction(ISD::FMAXIMUM, VT, Custom);
  1559. setOperationAction(ISD::FMAXNUM, VT, Custom);
  1560. setOperationAction(ISD::FMINIMUM, VT, Custom);
  1561. setOperationAction(ISD::FMINNUM, VT, Custom);
  1562. setOperationAction(ISD::FMUL, VT, Custom);
  1563. setOperationAction(ISD::FNEARBYINT, VT, Custom);
  1564. setOperationAction(ISD::FNEG, VT, Custom);
  1565. setOperationAction(ISD::FP_ROUND, VT, Custom);
  1566. setOperationAction(ISD::FP_TO_SINT, VT, Custom);
  1567. setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  1568. setOperationAction(ISD::FRINT, VT, Custom);
  1569. setOperationAction(ISD::FROUND, VT, Custom);
  1570. setOperationAction(ISD::FROUNDEVEN, VT, Custom);
  1571. setOperationAction(ISD::FSQRT, VT, Custom);
  1572. setOperationAction(ISD::FSUB, VT, Custom);
  1573. setOperationAction(ISD::FTRUNC, VT, Custom);
  1574. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  1575. setOperationAction(ISD::MLOAD, VT, Custom);
  1576. setOperationAction(ISD::MSTORE, VT, Custom);
  1577. setOperationAction(ISD::MUL, VT, Custom);
  1578. setOperationAction(ISD::MULHS, VT, Custom);
  1579. setOperationAction(ISD::MULHU, VT, Custom);
  1580. setOperationAction(ISD::OR, VT, Custom);
  1581. setOperationAction(ISD::SDIV, VT, Custom);
  1582. setOperationAction(ISD::SETCC, VT, Custom);
  1583. setOperationAction(ISD::SHL, VT, Custom);
  1584. setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
  1585. setOperationAction(ISD::SINT_TO_FP, VT, Custom);
  1586. setOperationAction(ISD::SMAX, VT, Custom);
  1587. setOperationAction(ISD::SMIN, VT, Custom);
  1588. setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
  1589. setOperationAction(ISD::SRA, VT, Custom);
  1590. setOperationAction(ISD::SRL, VT, Custom);
  1591. setOperationAction(ISD::SUB, VT, Custom);
  1592. setOperationAction(ISD::TRUNCATE, VT, Custom);
  1593. setOperationAction(ISD::UDIV, VT, Custom);
  1594. setOperationAction(ISD::UINT_TO_FP, VT, Custom);
  1595. setOperationAction(ISD::UMAX, VT, Custom);
  1596. setOperationAction(ISD::UMIN, VT, Custom);
  1597. setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
  1598. setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  1599. setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  1600. setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  1601. setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  1602. setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
  1603. setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
  1604. setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
  1605. setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
  1606. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  1607. setOperationAction(ISD::XOR, VT, Custom);
  1608. setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
  1609. }
  1610. void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
  1611. assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
  1612. // By default everything must be expanded.
  1613. for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
  1614. setOperationAction(Op, VT, Expand);
  1615. if (VT.isFloatingPoint()) {
  1616. setCondCodeAction(ISD::SETO, VT, Expand);
  1617. setCondCodeAction(ISD::SETOLT, VT, Expand);
  1618. setCondCodeAction(ISD::SETOLE, VT, Expand);
  1619. setCondCodeAction(ISD::SETULT, VT, Expand);
  1620. setCondCodeAction(ISD::SETULE, VT, Expand);
  1621. setCondCodeAction(ISD::SETUGE, VT, Expand);
  1622. setCondCodeAction(ISD::SETUGT, VT, Expand);
  1623. setCondCodeAction(ISD::SETUEQ, VT, Expand);
  1624. setCondCodeAction(ISD::SETONE, VT, Expand);
  1625. }
  1626. // Mark integer truncating stores/extending loads as having custom lowering
  1627. if (VT.isInteger()) {
  1628. MVT InnerVT = VT.changeVectorElementType(MVT::i8);
  1629. while (InnerVT != VT) {
  1630. setTruncStoreAction(VT, InnerVT, Custom);
  1631. setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
  1632. setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
  1633. InnerVT = InnerVT.changeVectorElementType(
  1634. MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
  1635. }
  1636. }
  1637. // Mark floating-point truncating stores/extending loads as having custom
  1638. // lowering
  1639. if (VT.isFloatingPoint()) {
  1640. MVT InnerVT = VT.changeVectorElementType(MVT::f16);
  1641. while (InnerVT != VT) {
  1642. setTruncStoreAction(VT, InnerVT, Custom);
  1643. setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
  1644. InnerVT = InnerVT.changeVectorElementType(
  1645. MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
  1646. }
  1647. }
  1648. // Lower fixed length vector operations to scalable equivalents.
  1649. setOperationAction(ISD::ABS, VT, Custom);
  1650. setOperationAction(ISD::ADD, VT, Custom);
  1651. setOperationAction(ISD::AND, VT, Custom);
  1652. setOperationAction(ISD::ANY_EXTEND, VT, Custom);
  1653. setOperationAction(ISD::BITCAST, VT, Custom);
  1654. setOperationAction(ISD::BITREVERSE, VT, Custom);
  1655. setOperationAction(ISD::BSWAP, VT, Custom);
  1656. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  1657. setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  1658. setOperationAction(ISD::CTLZ, VT, Custom);
  1659. setOperationAction(ISD::CTPOP, VT, Custom);
  1660. setOperationAction(ISD::CTTZ, VT, Custom);
  1661. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  1662. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  1663. setOperationAction(ISD::FABS, VT, Custom);
  1664. setOperationAction(ISD::FADD, VT, Custom);
  1665. setOperationAction(ISD::FCEIL, VT, Custom);
  1666. setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  1667. setOperationAction(ISD::FDIV, VT, Custom);
  1668. setOperationAction(ISD::FFLOOR, VT, Custom);
  1669. setOperationAction(ISD::FMA, VT, Custom);
  1670. setOperationAction(ISD::FMAXIMUM, VT, Custom);
  1671. setOperationAction(ISD::FMAXNUM, VT, Custom);
  1672. setOperationAction(ISD::FMINIMUM, VT, Custom);
  1673. setOperationAction(ISD::FMINNUM, VT, Custom);
  1674. setOperationAction(ISD::FMUL, VT, Custom);
  1675. setOperationAction(ISD::FNEARBYINT, VT, Custom);
  1676. setOperationAction(ISD::FNEG, VT, Custom);
  1677. setOperationAction(ISD::FP_EXTEND, VT, Custom);
  1678. setOperationAction(ISD::FP_ROUND, VT, Custom);
  1679. setOperationAction(ISD::FP_TO_SINT, VT, Custom);
  1680. setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  1681. setOperationAction(ISD::FRINT, VT, Custom);
  1682. setOperationAction(ISD::FROUND, VT, Custom);
  1683. setOperationAction(ISD::FROUNDEVEN, VT, Custom);
  1684. setOperationAction(ISD::FSQRT, VT, Custom);
  1685. setOperationAction(ISD::FSUB, VT, Custom);
  1686. setOperationAction(ISD::FTRUNC, VT, Custom);
  1687. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  1688. setOperationAction(ISD::LOAD, VT, Custom);
  1689. setOperationAction(ISD::MGATHER, VT, Custom);
  1690. setOperationAction(ISD::MLOAD, VT, Custom);
  1691. setOperationAction(ISD::MSCATTER, VT, Custom);
  1692. setOperationAction(ISD::MSTORE, VT, Custom);
  1693. setOperationAction(ISD::MUL, VT, Custom);
  1694. setOperationAction(ISD::MULHS, VT, Custom);
  1695. setOperationAction(ISD::MULHU, VT, Custom);
  1696. setOperationAction(ISD::OR, VT, Custom);
  1697. setOperationAction(ISD::SDIV, VT, Custom);
  1698. setOperationAction(ISD::SELECT, VT, Custom);
  1699. setOperationAction(ISD::SETCC, VT, Custom);
  1700. setOperationAction(ISD::SHL, VT, Custom);
  1701. setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
  1702. setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
  1703. setOperationAction(ISD::SINT_TO_FP, VT, Custom);
  1704. setOperationAction(ISD::SMAX, VT, Custom);
  1705. setOperationAction(ISD::SMIN, VT, Custom);
  1706. setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
  1707. setOperationAction(ISD::SRA, VT, Custom);
  1708. setOperationAction(ISD::SRL, VT, Custom);
  1709. setOperationAction(ISD::STORE, VT, Custom);
  1710. setOperationAction(ISD::SUB, VT, Custom);
  1711. setOperationAction(ISD::TRUNCATE, VT, Custom);
  1712. setOperationAction(ISD::UDIV, VT, Custom);
  1713. setOperationAction(ISD::UINT_TO_FP, VT, Custom);
  1714. setOperationAction(ISD::UMAX, VT, Custom);
  1715. setOperationAction(ISD::UMIN, VT, Custom);
  1716. setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
  1717. setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
  1718. setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  1719. setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  1720. setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  1721. setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
  1722. setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  1723. setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
  1724. setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
  1725. setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
  1726. setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
  1727. setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
  1728. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  1729. setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
  1730. setOperationAction(ISD::VSELECT, VT, Custom);
  1731. setOperationAction(ISD::XOR, VT, Custom);
  1732. setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
  1733. }
  1734. void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  1735. addRegisterClass(VT, &AArch64::FPR64RegClass);
  1736. addTypeForNEON(VT);
  1737. }
  1738. void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  1739. addRegisterClass(VT, &AArch64::FPR128RegClass);
  1740. addTypeForNEON(VT);
  1741. }
  1742. EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
  1743. LLVMContext &C, EVT VT) const {
  1744. if (!VT.isVector())
  1745. return MVT::i32;
  1746. if (VT.isScalableVector())
  1747. return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
  1748. return VT.changeVectorElementTypeToInteger();
  1749. }
  1750. // isIntImmediate - This method tests to see if the node is a constant
1751. // operand. If so, Imm will receive the value.
  1752. static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
  1753. if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
  1754. Imm = C->getZExtValue();
  1755. return true;
  1756. }
  1757. return false;
  1758. }
  1759. // isOpcWithIntImmediate - This method tests to see if the node is a specific
1760. // opcode and that it has an immediate integer right operand.
1761. // If so, Imm will receive the value.
  1762. static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
  1763. uint64_t &Imm) {
  1764. return N->getOpcode() == Opc &&
  1765. isIntImmediate(N->getOperand(1).getNode(), Imm);
  1766. }
  1767. static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
  1768. const APInt &Demanded,
  1769. TargetLowering::TargetLoweringOpt &TLO,
  1770. unsigned NewOpc) {
  1771. uint64_t OldImm = Imm, NewImm, Enc;
  1772. uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
  1773. // Return if the immediate is already all zeros, all ones, a bimm32 or a
  1774. // bimm64.
  1775. if (Imm == 0 || Imm == Mask ||
  1776. AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
  1777. return false;
  1778. unsigned EltSize = Size;
  1779. uint64_t DemandedBits = Demanded.getZExtValue();
  1780. // Clear bits that are not demanded.
  1781. Imm &= DemandedBits;
  1782. while (true) {
  1783. // The goal here is to set the non-demanded bits in a way that minimizes
1784. // the number of transitions between 0 and 1. In order to achieve this goal,
  1785. // we set the non-demanded bits to the value of the preceding demanded bits.
  1786. // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
  1787. // non-demanded bit), we copy bit0 (1) to the least significant 'x',
  1788. // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
  1789. // The final result is 0b11000011.
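// Tracing the loop for that example (illustrative, within a single 8-bit
// element): Imm = 0b01000001, DemandedBits = 0b01100101,
// NonDemandedBits = 0b10011010, Ones = 0b10000010,
// so NewImm = (Imm | Ones) = 0b11000011.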
  1790. uint64_t NonDemandedBits = ~DemandedBits;
  1791. uint64_t InvertedImm = ~Imm & DemandedBits;
  1792. uint64_t RotatedImm =
  1793. ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
  1794. NonDemandedBits;
  1795. uint64_t Sum = RotatedImm + NonDemandedBits;
  1796. bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  1797. uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  1798. NewImm = (Imm | Ones) & Mask;
  1799. // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
  1800. // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
  1801. // we halve the element size and continue the search.
  1802. if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
  1803. break;
  1804. // We cannot shrink the element size any further if it is 2-bits.
  1805. if (EltSize == 2)
  1806. return false;
  1807. EltSize /= 2;
  1808. Mask >>= EltSize;
  1809. uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1810. // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
  1811. if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
  1812. return false;
  1813. // Merge the upper and lower halves of Imm and DemandedBits.
  1814. Imm |= Hi;
  1815. DemandedBits |= DemandedBitsHi;
  1816. }
  1817. ++NumOptimizedImms;
  1818. // Replicate the element across the register width.
  1819. while (EltSize < Size) {
  1820. NewImm |= NewImm << EltSize;
  1821. EltSize *= 2;
  1822. }
  1823. (void)OldImm;
  1824. assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
  1825. "demanded bits should never be altered");
  1826. assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
  1827. // Create the new constant immediate node.
  1828. EVT VT = Op.getValueType();
  1829. SDLoc DL(Op);
  1830. SDValue New;
  1831. // If the new constant immediate is all-zeros or all-ones, let the target
  1832. // independent DAG combine optimize this node.
  1833. if (NewImm == 0 || NewImm == OrigMask) {
  1834. New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
  1835. TLO.DAG.getConstant(NewImm, DL, VT));
  1836. // Otherwise, create a machine node so that target independent DAG combine
  1837. // doesn't undo this optimization.
  1838. } else {
  1839. Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
  1840. SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
  1841. New = SDValue(
  1842. TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
  1843. }
  1844. return TLO.CombineTo(Op, New);
  1845. }
  1846. bool AArch64TargetLowering::targetShrinkDemandedConstant(
  1847. SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
  1848. TargetLoweringOpt &TLO) const {
  1849. // Delay this optimization to as late as possible.
  1850. if (!TLO.LegalOps)
  1851. return false;
  1852. if (!EnableOptimizeLogicalImm)
  1853. return false;
  1854. EVT VT = Op.getValueType();
  1855. if (VT.isVector())
  1856. return false;
  1857. unsigned Size = VT.getSizeInBits();
  1858. assert((Size == 32 || Size == 64) &&
  1859. "i32 or i64 is expected after legalization.");
  1860. // Exit early if we demand all bits.
  1861. if (DemandedBits.countPopulation() == Size)
  1862. return false;
  1863. unsigned NewOpc;
  1864. switch (Op.getOpcode()) {
  1865. default:
  1866. return false;
  1867. case ISD::AND:
  1868. NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
  1869. break;
  1870. case ISD::OR:
  1871. NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
  1872. break;
  1873. case ISD::XOR:
  1874. NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
  1875. break;
  1876. }
  1877. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  1878. if (!C)
  1879. return false;
  1880. uint64_t Imm = C->getZExtValue();
  1881. return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
  1882. }
  1883. /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1884. /// Mask are known to be either zero or one and return them in Known.
  1885. void AArch64TargetLowering::computeKnownBitsForTargetNode(
  1886. const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
  1887. const SelectionDAG &DAG, unsigned Depth) const {
  1888. switch (Op.getOpcode()) {
  1889. default:
  1890. break;
  1891. case AArch64ISD::DUP: {
  1892. SDValue SrcOp = Op.getOperand(0);
  1893. Known = DAG.computeKnownBits(SrcOp, Depth + 1);
  1894. if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
  1895. assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
  1896. "Expected DUP implicit truncation");
  1897. Known = Known.trunc(Op.getScalarValueSizeInBits());
  1898. }
  1899. break;
  1900. }
  1901. case AArch64ISD::CSEL: {
  1902. KnownBits Known2;
  1903. Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  1904. Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
  1905. Known = KnownBits::commonBits(Known, Known2);
  1906. break;
  1907. }
  1908. case AArch64ISD::BICi: {
  1909. // Compute the bit cleared value.
  1910. uint64_t Mask =
  1911. ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
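// E.g. an immediate of 0xff with shift 8 clears bits [15:8] of each lane, so
// those bits of the result become known zero.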
  1912. Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  1913. Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
  1914. break;
  1915. }
  1916. case AArch64ISD::VLSHR: {
  1917. KnownBits Known2;
  1918. Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  1919. Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
  1920. Known = KnownBits::lshr(Known, Known2);
  1921. break;
  1922. }
  1923. case AArch64ISD::VASHR: {
  1924. KnownBits Known2;
  1925. Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  1926. Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
  1927. Known = KnownBits::ashr(Known, Known2);
  1928. break;
  1929. }
  1930. case AArch64ISD::MOVI: {
  1931. ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
  1932. Known =
  1933. KnownBits::makeConstant(APInt(Known.getBitWidth(), CN->getZExtValue()));
  1934. break;
  1935. }
  1936. case AArch64ISD::LOADgot:
  1937. case AArch64ISD::ADDlow: {
  1938. if (!Subtarget->isTargetILP32())
  1939. break;
  1940. // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
  1941. Known.Zero = APInt::getHighBitsSet(64, 32);
  1942. break;
  1943. }
  1944. case AArch64ISD::ASSERT_ZEXT_BOOL: {
  1945. Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  1946. Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
  1947. break;
  1948. }
  1949. case ISD::INTRINSIC_W_CHAIN: {
  1950. ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
  1951. Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
  1952. switch (IntID) {
  1953. default: return;
  1954. case Intrinsic::aarch64_ldaxr:
  1955. case Intrinsic::aarch64_ldxr: {
  1956. unsigned BitWidth = Known.getBitWidth();
  1957. EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
  1958. unsigned MemBits = VT.getScalarSizeInBits();
  1959. Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
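// E.g. an i8 ldxr/ldaxr zero-extends the loaded byte, so the upper 56 bits of
// the i64 result are known zero.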
  1960. return;
  1961. }
  1962. }
  1963. break;
  1964. }
  1965. case ISD::INTRINSIC_WO_CHAIN:
  1966. case ISD::INTRINSIC_VOID: {
  1967. unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  1968. switch (IntNo) {
  1969. default:
  1970. break;
  1971. case Intrinsic::aarch64_neon_umaxv:
  1972. case Intrinsic::aarch64_neon_uminv: {
  1973. // Figure out the datatype of the vector operand. The UMINV instruction
  1974. // will zero extend the result, so we can mark as known zero all the
1975. // bits larger than the element datatype. 32-bit or larger doesn't need
  1976. // this as those are legal types and will be handled by isel directly.
  1977. MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
  1978. unsigned BitWidth = Known.getBitWidth();
  1979. if (VT == MVT::v8i8 || VT == MVT::v16i8) {
  1980. assert(BitWidth >= 8 && "Unexpected width!");
  1981. APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
  1982. Known.Zero |= Mask;
  1983. } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
  1984. assert(BitWidth >= 16 && "Unexpected width!");
  1985. APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
  1986. Known.Zero |= Mask;
  1987. }
  1988. break;
  1989. } break;
  1990. }
  1991. }
  1992. }
  1993. }
  1994. MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
  1995. EVT) const {
  1996. return MVT::i64;
  1997. }
  1998. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
  1999. EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
  2000. unsigned *Fast) const {
  2001. if (Subtarget->requiresStrictAlign())
  2002. return false;
  2003. if (Fast) {
  2004. // Some CPUs are fine with unaligned stores except for 128-bit ones.
  2005. *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
  2006. // See comments in performSTORECombine() for more details about
  2007. // these conditions.
  2008. // Code that uses clang vector extensions can mark that it
  2009. // wants unaligned accesses to be treated as fast by
  2010. // underspecifying alignment to be 1 or 2.
  2011. Alignment <= 2 ||
  2012. // Disregard v2i64. Memcpy lowering produces those and splitting
  2013. // them regresses performance on micro-benchmarks and olden/bh.
  2014. VT == MVT::v2i64;
  2015. }
  2016. return true;
  2017. }
  2018. // Same as above but handling LLTs instead.
  2019. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
  2020. LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
  2021. unsigned *Fast) const {
  2022. if (Subtarget->requiresStrictAlign())
  2023. return false;
  2024. if (Fast) {
  2025. // Some CPUs are fine with unaligned stores except for 128-bit ones.
  2026. *Fast = !Subtarget->isMisaligned128StoreSlow() ||
  2027. Ty.getSizeInBytes() != 16 ||
  2028. // See comments in performSTORECombine() for more details about
  2029. // these conditions.
  2030. // Code that uses clang vector extensions can mark that it
  2031. // wants unaligned accesses to be treated as fast by
  2032. // underspecifying alignment to be 1 or 2.
  2033. Alignment <= 2 ||
  2034. // Disregard v2i64. Memcpy lowering produces those and splitting
  2035. // them regresses performance on micro-benchmarks and olden/bh.
  2036. Ty == LLT::fixed_vector(2, 64);
  2037. }
  2038. return true;
  2039. }
  2040. FastISel *
  2041. AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
  2042. const TargetLibraryInfo *libInfo) const {
  2043. return AArch64::createFastISel(funcInfo, libInfo);
  2044. }
  2045. const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  2046. #define MAKE_CASE(V) \
  2047. case V: \
  2048. return #V;
  2049. switch ((AArch64ISD::NodeType)Opcode) {
  2050. case AArch64ISD::FIRST_NUMBER:
  2051. break;
  2052. MAKE_CASE(AArch64ISD::OBSCURE_COPY)
  2053. MAKE_CASE(AArch64ISD::SMSTART)
  2054. MAKE_CASE(AArch64ISD::SMSTOP)
  2055. MAKE_CASE(AArch64ISD::RESTORE_ZA)
  2056. MAKE_CASE(AArch64ISD::CALL)
  2057. MAKE_CASE(AArch64ISD::ADRP)
  2058. MAKE_CASE(AArch64ISD::ADR)
  2059. MAKE_CASE(AArch64ISD::ADDlow)
  2060. MAKE_CASE(AArch64ISD::LOADgot)
  2061. MAKE_CASE(AArch64ISD::RET_FLAG)
  2062. MAKE_CASE(AArch64ISD::BRCOND)
  2063. MAKE_CASE(AArch64ISD::CSEL)
  2064. MAKE_CASE(AArch64ISD::CSINV)
  2065. MAKE_CASE(AArch64ISD::CSNEG)
  2066. MAKE_CASE(AArch64ISD::CSINC)
  2067. MAKE_CASE(AArch64ISD::THREAD_POINTER)
  2068. MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
  2069. MAKE_CASE(AArch64ISD::ABDS_PRED)
  2070. MAKE_CASE(AArch64ISD::ABDU_PRED)
  2071. MAKE_CASE(AArch64ISD::HADDS_PRED)
  2072. MAKE_CASE(AArch64ISD::HADDU_PRED)
  2073. MAKE_CASE(AArch64ISD::MUL_PRED)
  2074. MAKE_CASE(AArch64ISD::MULHS_PRED)
  2075. MAKE_CASE(AArch64ISD::MULHU_PRED)
  2076. MAKE_CASE(AArch64ISD::RHADDS_PRED)
  2077. MAKE_CASE(AArch64ISD::RHADDU_PRED)
  2078. MAKE_CASE(AArch64ISD::SDIV_PRED)
  2079. MAKE_CASE(AArch64ISD::SHL_PRED)
  2080. MAKE_CASE(AArch64ISD::SMAX_PRED)
  2081. MAKE_CASE(AArch64ISD::SMIN_PRED)
  2082. MAKE_CASE(AArch64ISD::SRA_PRED)
  2083. MAKE_CASE(AArch64ISD::SRL_PRED)
  2084. MAKE_CASE(AArch64ISD::UDIV_PRED)
  2085. MAKE_CASE(AArch64ISD::UMAX_PRED)
  2086. MAKE_CASE(AArch64ISD::UMIN_PRED)
  2087. MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
  2088. MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
  2089. MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
  2090. MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
  2091. MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
  2092. MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
  2093. MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
  2094. MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
  2095. MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
  2096. MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
  2097. MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
  2098. MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
  2099. MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
  2100. MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
  2101. MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
  2102. MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
  2103. MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
  2104. MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
  2105. MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
  2106. MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
  2107. MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
  2108. MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
  2109. MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
  2110. MAKE_CASE(AArch64ISD::ADC)
  2111. MAKE_CASE(AArch64ISD::SBC)
  2112. MAKE_CASE(AArch64ISD::ADDS)
  2113. MAKE_CASE(AArch64ISD::SUBS)
  2114. MAKE_CASE(AArch64ISD::ADCS)
  2115. MAKE_CASE(AArch64ISD::SBCS)
  2116. MAKE_CASE(AArch64ISD::ANDS)
  2117. MAKE_CASE(AArch64ISD::CCMP)
  2118. MAKE_CASE(AArch64ISD::CCMN)
  2119. MAKE_CASE(AArch64ISD::FCCMP)
  2120. MAKE_CASE(AArch64ISD::FCMP)
  2121. MAKE_CASE(AArch64ISD::STRICT_FCMP)
  2122. MAKE_CASE(AArch64ISD::STRICT_FCMPE)
  2123. MAKE_CASE(AArch64ISD::DUP)
  2124. MAKE_CASE(AArch64ISD::DUPLANE8)
  2125. MAKE_CASE(AArch64ISD::DUPLANE16)
  2126. MAKE_CASE(AArch64ISD::DUPLANE32)
  2127. MAKE_CASE(AArch64ISD::DUPLANE64)
  2128. MAKE_CASE(AArch64ISD::DUPLANE128)
  2129. MAKE_CASE(AArch64ISD::MOVI)
  2130. MAKE_CASE(AArch64ISD::MOVIshift)
  2131. MAKE_CASE(AArch64ISD::MOVIedit)
  2132. MAKE_CASE(AArch64ISD::MOVImsl)
  2133. MAKE_CASE(AArch64ISD::FMOV)
  2134. MAKE_CASE(AArch64ISD::MVNIshift)
  2135. MAKE_CASE(AArch64ISD::MVNImsl)
  2136. MAKE_CASE(AArch64ISD::BICi)
  2137. MAKE_CASE(AArch64ISD::ORRi)
  2138. MAKE_CASE(AArch64ISD::BSP)
  2139. MAKE_CASE(AArch64ISD::EXTR)
  2140. MAKE_CASE(AArch64ISD::ZIP1)
  2141. MAKE_CASE(AArch64ISD::ZIP2)
  2142. MAKE_CASE(AArch64ISD::UZP1)
  2143. MAKE_CASE(AArch64ISD::UZP2)
  2144. MAKE_CASE(AArch64ISD::TRN1)
  2145. MAKE_CASE(AArch64ISD::TRN2)
  2146. MAKE_CASE(AArch64ISD::REV16)
  2147. MAKE_CASE(AArch64ISD::REV32)
  2148. MAKE_CASE(AArch64ISD::REV64)
  2149. MAKE_CASE(AArch64ISD::EXT)
  2150. MAKE_CASE(AArch64ISD::SPLICE)
  2151. MAKE_CASE(AArch64ISD::VSHL)
  2152. MAKE_CASE(AArch64ISD::VLSHR)
  2153. MAKE_CASE(AArch64ISD::VASHR)
  2154. MAKE_CASE(AArch64ISD::VSLI)
  2155. MAKE_CASE(AArch64ISD::VSRI)
  2156. MAKE_CASE(AArch64ISD::CMEQ)
  2157. MAKE_CASE(AArch64ISD::CMGE)
  2158. MAKE_CASE(AArch64ISD::CMGT)
  2159. MAKE_CASE(AArch64ISD::CMHI)
  2160. MAKE_CASE(AArch64ISD::CMHS)
  2161. MAKE_CASE(AArch64ISD::FCMEQ)
  2162. MAKE_CASE(AArch64ISD::FCMGE)
  2163. MAKE_CASE(AArch64ISD::FCMGT)
  2164. MAKE_CASE(AArch64ISD::CMEQz)
  2165. MAKE_CASE(AArch64ISD::CMGEz)
  2166. MAKE_CASE(AArch64ISD::CMGTz)
  2167. MAKE_CASE(AArch64ISD::CMLEz)
  2168. MAKE_CASE(AArch64ISD::CMLTz)
  2169. MAKE_CASE(AArch64ISD::FCMEQz)
  2170. MAKE_CASE(AArch64ISD::FCMGEz)
  2171. MAKE_CASE(AArch64ISD::FCMGTz)
  2172. MAKE_CASE(AArch64ISD::FCMLEz)
  2173. MAKE_CASE(AArch64ISD::FCMLTz)
  2174. MAKE_CASE(AArch64ISD::SADDV)
  2175. MAKE_CASE(AArch64ISD::UADDV)
  2176. MAKE_CASE(AArch64ISD::SDOT)
  2177. MAKE_CASE(AArch64ISD::UDOT)
  2178. MAKE_CASE(AArch64ISD::SMINV)
  2179. MAKE_CASE(AArch64ISD::UMINV)
  2180. MAKE_CASE(AArch64ISD::SMAXV)
  2181. MAKE_CASE(AArch64ISD::UMAXV)
  2182. MAKE_CASE(AArch64ISD::SADDV_PRED)
  2183. MAKE_CASE(AArch64ISD::UADDV_PRED)
  2184. MAKE_CASE(AArch64ISD::SMAXV_PRED)
  2185. MAKE_CASE(AArch64ISD::UMAXV_PRED)
  2186. MAKE_CASE(AArch64ISD::SMINV_PRED)
  2187. MAKE_CASE(AArch64ISD::UMINV_PRED)
  2188. MAKE_CASE(AArch64ISD::ORV_PRED)
  2189. MAKE_CASE(AArch64ISD::EORV_PRED)
  2190. MAKE_CASE(AArch64ISD::ANDV_PRED)
  2191. MAKE_CASE(AArch64ISD::CLASTA_N)
  2192. MAKE_CASE(AArch64ISD::CLASTB_N)
  2193. MAKE_CASE(AArch64ISD::LASTA)
  2194. MAKE_CASE(AArch64ISD::LASTB)
  2195. MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
  2196. MAKE_CASE(AArch64ISD::LS64_BUILD)
  2197. MAKE_CASE(AArch64ISD::LS64_EXTRACT)
  2198. MAKE_CASE(AArch64ISD::TBL)
  2199. MAKE_CASE(AArch64ISD::FADD_PRED)
  2200. MAKE_CASE(AArch64ISD::FADDA_PRED)
  2201. MAKE_CASE(AArch64ISD::FADDV_PRED)
  2202. MAKE_CASE(AArch64ISD::FDIV_PRED)
  2203. MAKE_CASE(AArch64ISD::FMA_PRED)
  2204. MAKE_CASE(AArch64ISD::FMAX_PRED)
  2205. MAKE_CASE(AArch64ISD::FMAXV_PRED)
  2206. MAKE_CASE(AArch64ISD::FMAXNM_PRED)
  2207. MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
  2208. MAKE_CASE(AArch64ISD::FMIN_PRED)
  2209. MAKE_CASE(AArch64ISD::FMINV_PRED)
  2210. MAKE_CASE(AArch64ISD::FMINNM_PRED)
  2211. MAKE_CASE(AArch64ISD::FMINNMV_PRED)
  2212. MAKE_CASE(AArch64ISD::FMUL_PRED)
  2213. MAKE_CASE(AArch64ISD::FSUB_PRED)
  2214. MAKE_CASE(AArch64ISD::RDSVL)
  2215. MAKE_CASE(AArch64ISD::BIC)
  2216. MAKE_CASE(AArch64ISD::BIT)
  2217. MAKE_CASE(AArch64ISD::CBZ)
  2218. MAKE_CASE(AArch64ISD::CBNZ)
  2219. MAKE_CASE(AArch64ISD::TBZ)
  2220. MAKE_CASE(AArch64ISD::TBNZ)
  2221. MAKE_CASE(AArch64ISD::TC_RETURN)
  2222. MAKE_CASE(AArch64ISD::PREFETCH)
  2223. MAKE_CASE(AArch64ISD::SITOF)
  2224. MAKE_CASE(AArch64ISD::UITOF)
  2225. MAKE_CASE(AArch64ISD::NVCAST)
  2226. MAKE_CASE(AArch64ISD::MRS)
  2227. MAKE_CASE(AArch64ISD::SQSHL_I)
  2228. MAKE_CASE(AArch64ISD::UQSHL_I)
  2229. MAKE_CASE(AArch64ISD::SRSHR_I)
  2230. MAKE_CASE(AArch64ISD::URSHR_I)
  2231. MAKE_CASE(AArch64ISD::SQSHLU_I)
  2232. MAKE_CASE(AArch64ISD::WrapperLarge)
  2233. MAKE_CASE(AArch64ISD::LD2post)
  2234. MAKE_CASE(AArch64ISD::LD3post)
  2235. MAKE_CASE(AArch64ISD::LD4post)
  2236. MAKE_CASE(AArch64ISD::ST2post)
  2237. MAKE_CASE(AArch64ISD::ST3post)
  2238. MAKE_CASE(AArch64ISD::ST4post)
  2239. MAKE_CASE(AArch64ISD::LD1x2post)
  2240. MAKE_CASE(AArch64ISD::LD1x3post)
  2241. MAKE_CASE(AArch64ISD::LD1x4post)
  2242. MAKE_CASE(AArch64ISD::ST1x2post)
  2243. MAKE_CASE(AArch64ISD::ST1x3post)
  2244. MAKE_CASE(AArch64ISD::ST1x4post)
  2245. MAKE_CASE(AArch64ISD::LD1DUPpost)
  2246. MAKE_CASE(AArch64ISD::LD2DUPpost)
  2247. MAKE_CASE(AArch64ISD::LD3DUPpost)
  2248. MAKE_CASE(AArch64ISD::LD4DUPpost)
  2249. MAKE_CASE(AArch64ISD::LD1LANEpost)
  2250. MAKE_CASE(AArch64ISD::LD2LANEpost)
  2251. MAKE_CASE(AArch64ISD::LD3LANEpost)
  2252. MAKE_CASE(AArch64ISD::LD4LANEpost)
  2253. MAKE_CASE(AArch64ISD::ST2LANEpost)
  2254. MAKE_CASE(AArch64ISD::ST3LANEpost)
  2255. MAKE_CASE(AArch64ISD::ST4LANEpost)
  2256. MAKE_CASE(AArch64ISD::SMULL)
  2257. MAKE_CASE(AArch64ISD::UMULL)
  2258. MAKE_CASE(AArch64ISD::PMULL)
  2259. MAKE_CASE(AArch64ISD::FRECPE)
  2260. MAKE_CASE(AArch64ISD::FRECPS)
  2261. MAKE_CASE(AArch64ISD::FRSQRTE)
  2262. MAKE_CASE(AArch64ISD::FRSQRTS)
  2263. MAKE_CASE(AArch64ISD::STG)
  2264. MAKE_CASE(AArch64ISD::STZG)
  2265. MAKE_CASE(AArch64ISD::ST2G)
  2266. MAKE_CASE(AArch64ISD::STZ2G)
  2267. MAKE_CASE(AArch64ISD::SUNPKHI)
  2268. MAKE_CASE(AArch64ISD::SUNPKLO)
  2269. MAKE_CASE(AArch64ISD::UUNPKHI)
  2270. MAKE_CASE(AArch64ISD::UUNPKLO)
  2271. MAKE_CASE(AArch64ISD::INSR)
  2272. MAKE_CASE(AArch64ISD::PTEST)
  2273. MAKE_CASE(AArch64ISD::PTEST_ANY)
  2274. MAKE_CASE(AArch64ISD::PTRUE)
  2275. MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
  2276. MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
  2277. MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
  2278. MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
  2279. MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
  2280. MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
  2281. MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
  2282. MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
  2283. MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
  2284. MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
  2285. MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
  2286. MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
  2287. MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
  2288. MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
  2289. MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
  2290. MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
  2291. MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
  2292. MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
  2293. MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
  2294. MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
  2295. MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
  2296. MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
  2297. MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
  2298. MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
  2299. MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
  2300. MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
  2301. MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
  2302. MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
  2303. MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
  2304. MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
  2305. MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
  2306. MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
  2307. MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
  2308. MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
  2309. MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
  2310. MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
  2311. MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
  2312. MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
  2313. MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
  2314. MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
  2315. MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
  2316. MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
  2317. MAKE_CASE(AArch64ISD::ST1_PRED)
  2318. MAKE_CASE(AArch64ISD::SST1_PRED)
  2319. MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
  2320. MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
  2321. MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
  2322. MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
  2323. MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
  2324. MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
  2325. MAKE_CASE(AArch64ISD::SSTNT1_PRED)
  2326. MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
  2327. MAKE_CASE(AArch64ISD::LDP)
  2328. MAKE_CASE(AArch64ISD::LDNP)
  2329. MAKE_CASE(AArch64ISD::STP)
  2330. MAKE_CASE(AArch64ISD::STNP)
  2331. MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
  2332. MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
  2333. MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
  2334. MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
  2335. MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
  2336. MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
  2337. MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
  2338. MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
  2339. MAKE_CASE(AArch64ISD::INDEX_VECTOR)
  2340. MAKE_CASE(AArch64ISD::ADDP)
  2341. MAKE_CASE(AArch64ISD::SADDLP)
  2342. MAKE_CASE(AArch64ISD::UADDLP)
  2343. MAKE_CASE(AArch64ISD::CALL_RVMARKER)
  2344. MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
  2345. MAKE_CASE(AArch64ISD::MOPS_MEMSET)
  2346. MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
  2347. MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
  2348. MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
  2349. MAKE_CASE(AArch64ISD::CALL_BTI)
  2350. MAKE_CASE(AArch64ISD::MRRS)
  2351. MAKE_CASE(AArch64ISD::MSRR)
  2352. }
  2353. #undef MAKE_CASE
  2354. return nullptr;
  2355. }
  2356. MachineBasicBlock *
  2357. AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
  2358. MachineBasicBlock *MBB) const {
  2359. // We materialise the F128CSEL pseudo-instruction as some control flow and a
  2360. // phi node:
  2361. // OrigBB:
  2362. // [... previous instrs leading to comparison ...]
  2363. // b.ne TrueBB
  2364. // b EndBB
  2365. // TrueBB:
  2366. // ; Fallthrough
  2367. // EndBB:
  2368. // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
  2369. MachineFunction *MF = MBB->getParent();
  2370. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2371. const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  2372. DebugLoc DL = MI.getDebugLoc();
  2373. MachineFunction::iterator It = ++MBB->getIterator();
  2374. Register DestReg = MI.getOperand(0).getReg();
  2375. Register IfTrueReg = MI.getOperand(1).getReg();
  2376. Register IfFalseReg = MI.getOperand(2).getReg();
  2377. unsigned CondCode = MI.getOperand(3).getImm();
  2378. bool NZCVKilled = MI.getOperand(4).isKill();
  2379. MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  2380. MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  2381. MF->insert(It, TrueBB);
  2382. MF->insert(It, EndBB);
  2383. // Transfer rest of current basic-block to EndBB
  2384. EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
  2385. MBB->end());
  2386. EndBB->transferSuccessorsAndUpdatePHIs(MBB);
  2387. BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  2388. BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  2389. MBB->addSuccessor(TrueBB);
  2390. MBB->addSuccessor(EndBB);
  2391. // TrueBB falls through to the end.
  2392. TrueBB->addSuccessor(EndBB);
  2393. if (!NZCVKilled) {
  2394. TrueBB->addLiveIn(AArch64::NZCV);
  2395. EndBB->addLiveIn(AArch64::NZCV);
  2396. }
  2397. BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
  2398. .addReg(IfTrueReg)
  2399. .addMBB(TrueBB)
  2400. .addReg(IfFalseReg)
  2401. .addMBB(MBB);
  2402. MI.eraseFromParent();
  2403. return EndBB;
  2404. }
  2405. MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
  2406. MachineInstr &MI, MachineBasicBlock *BB) const {
  2407. assert(!isAsynchronousEHPersonality(classifyEHPersonality(
  2408. BB->getParent()->getFunction().getPersonalityFn())) &&
  2409. "SEH does not use catchret!");
  2410. return BB;
  2411. }
  2412. MachineBasicBlock *
  2413. AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
  2414. MachineInstr &MI,
  2415. MachineBasicBlock *BB) const {
  2416. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2417. MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
  2418. MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  2419. MIB.add(MI.getOperand(1)); // slice index register
  2420. MIB.add(MI.getOperand(2)); // slice index offset
  2421. MIB.add(MI.getOperand(3)); // pg
  2422. MIB.add(MI.getOperand(4)); // base
  2423. MIB.add(MI.getOperand(5)); // offset
  2424. MI.eraseFromParent(); // The pseudo is gone now.
  2425. return BB;
  2426. }
  2427. MachineBasicBlock *
  2428. AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
  2429. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2430. MachineInstrBuilder MIB =
  2431. BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
  2432. MIB.addReg(AArch64::ZA, RegState::Define);
  2433. MIB.add(MI.getOperand(0)); // Vector select register
  2434. MIB.add(MI.getOperand(1)); // Vector select offset
  2435. MIB.add(MI.getOperand(2)); // Base
  2436. MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
  2437. MI.eraseFromParent(); // The pseudo is gone now.
  2438. return BB;
  2439. }
  2440. MachineBasicBlock *
  2441. AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
  2442. MachineInstr &MI,
  2443. MachineBasicBlock *BB, bool HasTile) const {
  2444. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2445. MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
  2446. unsigned StartIdx = 0;
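  // For tile forms, operand 0 is an immediate tile index: translate it into
  // the concrete ZA tile register (BaseReg + index), added as both a def and
  // a use, and skip it when copying the remaining operands below.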
  2447. if (HasTile) {
  2448. MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  2449. MIB.addReg(BaseReg + MI.getOperand(0).getImm());
  2450. StartIdx = 1;
  2451. } else
  2452. MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
  2453. for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
  2454. MIB.add(MI.getOperand(I));
  2455. MI.eraseFromParent(); // The pseudo is gone now.
  2456. return BB;
  2457. }
  2458. MachineBasicBlock *
  2459. AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
  2460. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2461. MachineInstrBuilder MIB =
  2462. BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
  2463. MIB.add(MI.getOperand(0)); // Mask
  2464. unsigned Mask = MI.getOperand(0).getImm();
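  // Each of the low 8 mask bits selects one 64-bit tile (ZAD0..ZAD7); mark
  // every selected tile as implicitly defined by the ZERO instruction.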
  2465. for (unsigned I = 0; I < 8; I++) {
  2466. if (Mask & (1 << I))
  2467. MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
  2468. }
  2469. MI.eraseFromParent(); // The pseudo is gone now.
  2470. return BB;
  2471. }
  2472. MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
  2473. MachineInstr &MI, MachineBasicBlock *BB) const {
  2474. int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
  2475. if (SMEOrigInstr != -1) {
  2476. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2477. uint64_t SMEMatrixType =
  2478. TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
  2479. switch (SMEMatrixType) {
  2480. case (AArch64::SMEMatrixArray):
  2481. return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
  2482. case (AArch64::SMEMatrixTileB):
  2483. return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
  2484. case (AArch64::SMEMatrixTileH):
  2485. return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
  2486. case (AArch64::SMEMatrixTileS):
  2487. return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
  2488. case (AArch64::SMEMatrixTileD):
  2489. return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
  2490. case (AArch64::SMEMatrixTileQ):
  2491. return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
  2492. }
  2493. }
  2494. switch (MI.getOpcode()) {
  2495. default:
  2496. #ifndef NDEBUG
  2497. MI.dump();
  2498. #endif
  2499. llvm_unreachable("Unexpected instruction for custom inserter!");
  2500. case AArch64::F128CSEL:
  2501. return EmitF128CSEL(MI, BB);
  2502. case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo-instruction which has no implicit defs/uses,
    // while the BL call instruction (which the statepoint is eventually
    // lowered to) does have an implicit def. That def is early-clobber, as it
    // is written at the moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
  2508. MI.addOperand(*MI.getMF(),
  2509. MachineOperand::CreateReg(
  2510. AArch64::LR, /*isDef*/ true,
  2511. /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
  2512. /*isUndef*/ false, /*isEarlyClobber*/ true));
  2513. [[fallthrough]];
  2514. case TargetOpcode::STACKMAP:
  2515. case TargetOpcode::PATCHPOINT:
  2516. return emitPatchPoint(MI, BB);
  2517. case AArch64::CATCHRET:
  2518. return EmitLoweredCatchRet(MI, BB);
  2519. case AArch64::LD1_MXIPXX_H_PSEUDO_B:
  2520. return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
  2521. case AArch64::LD1_MXIPXX_H_PSEUDO_H:
  2522. return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
  2523. case AArch64::LD1_MXIPXX_H_PSEUDO_S:
  2524. return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
  2525. case AArch64::LD1_MXIPXX_H_PSEUDO_D:
  2526. return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
  2527. case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
  2528. return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
  2529. case AArch64::LD1_MXIPXX_V_PSEUDO_B:
  2530. return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
  2531. case AArch64::LD1_MXIPXX_V_PSEUDO_H:
  2532. return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
  2533. case AArch64::LD1_MXIPXX_V_PSEUDO_S:
  2534. return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
  2535. case AArch64::LD1_MXIPXX_V_PSEUDO_D:
  2536. return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
  2537. case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
  2538. return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
  2539. case AArch64::LDR_ZA_PSEUDO:
  2540. return EmitFill(MI, BB);
  2541. case AArch64::ZERO_M_PSEUDO:
  2542. return EmitZero(MI, BB);
  2543. }
  2544. }
  2545. //===----------------------------------------------------------------------===//
  2546. // AArch64 Lowering private implementation.
  2547. //===----------------------------------------------------------------------===//
  2548. //===----------------------------------------------------------------------===//
  2549. // Lowering Code
  2550. //===----------------------------------------------------------------------===//
  2551. // Forward declarations of SVE fixed length lowering helpers
  2552. static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
  2553. static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
  2554. static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
  2555. static SDValue convertFixedMaskToScalableVector(SDValue Mask,
  2556. SelectionDAG &DAG);
  2557. static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
  2558. EVT VT);
  2559. /// isZerosVector - Check whether SDNode N is a zero-filled vector.
  2560. static bool isZerosVector(const SDNode *N) {
  2561. // Look through a bit convert.
  2562. while (N->getOpcode() == ISD::BITCAST)
  2563. N = N->getOperand(0).getNode();
  2564. if (ISD::isConstantSplatVectorAllZeros(N))
  2565. return true;
  2566. if (N->getOpcode() != AArch64ISD::DUP)
  2567. return false;
  2568. auto Opnd0 = N->getOperand(0);
  2569. return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
  2570. }
  2571. /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
  2572. /// CC
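/// For example, the signed and unsigned forms map to different AArch64
/// conditions: SETLT -> LT but SETULT -> LO, and SETGE -> GE but SETUGE -> HS.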
  2573. static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  2574. switch (CC) {
  2575. default:
  2576. llvm_unreachable("Unknown condition code!");
  2577. case ISD::SETNE:
  2578. return AArch64CC::NE;
  2579. case ISD::SETEQ:
  2580. return AArch64CC::EQ;
  2581. case ISD::SETGT:
  2582. return AArch64CC::GT;
  2583. case ISD::SETGE:
  2584. return AArch64CC::GE;
  2585. case ISD::SETLT:
  2586. return AArch64CC::LT;
  2587. case ISD::SETLE:
  2588. return AArch64CC::LE;
  2589. case ISD::SETUGT:
  2590. return AArch64CC::HI;
  2591. case ISD::SETUGE:
  2592. return AArch64CC::HS;
  2593. case ISD::SETULT:
  2594. return AArch64CC::LO;
  2595. case ISD::SETULE:
  2596. return AArch64CC::LS;
  2597. }
  2598. }
  2599. /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
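/// Some conditions (e.g. SETONE, SETUEQ) cannot be tested with a single
/// AArch64 condition; in that case CondCode2 is set to a second condition
/// whose result must be OR'ed with the first (AArch64CC::AL means "unused").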
  2600. static void changeFPCCToAArch64CC(ISD::CondCode CC,
  2601. AArch64CC::CondCode &CondCode,
  2602. AArch64CC::CondCode &CondCode2) {
  2603. CondCode2 = AArch64CC::AL;
  2604. switch (CC) {
  2605. default:
  2606. llvm_unreachable("Unknown FP condition!");
  2607. case ISD::SETEQ:
  2608. case ISD::SETOEQ:
  2609. CondCode = AArch64CC::EQ;
  2610. break;
  2611. case ISD::SETGT:
  2612. case ISD::SETOGT:
  2613. CondCode = AArch64CC::GT;
  2614. break;
  2615. case ISD::SETGE:
  2616. case ISD::SETOGE:
  2617. CondCode = AArch64CC::GE;
  2618. break;
  2619. case ISD::SETOLT:
  2620. CondCode = AArch64CC::MI;
  2621. break;
  2622. case ISD::SETOLE:
  2623. CondCode = AArch64CC::LS;
  2624. break;
  2625. case ISD::SETONE:
  2626. CondCode = AArch64CC::MI;
  2627. CondCode2 = AArch64CC::GT;
  2628. break;
  2629. case ISD::SETO:
  2630. CondCode = AArch64CC::VC;
  2631. break;
  2632. case ISD::SETUO:
  2633. CondCode = AArch64CC::VS;
  2634. break;
  2635. case ISD::SETUEQ:
  2636. CondCode = AArch64CC::EQ;
  2637. CondCode2 = AArch64CC::VS;
  2638. break;
  2639. case ISD::SETUGT:
  2640. CondCode = AArch64CC::HI;
  2641. break;
  2642. case ISD::SETUGE:
  2643. CondCode = AArch64CC::PL;
  2644. break;
  2645. case ISD::SETLT:
  2646. case ISD::SETULT:
  2647. CondCode = AArch64CC::LT;
  2648. break;
  2649. case ISD::SETLE:
  2650. case ISD::SETULE:
  2651. CondCode = AArch64CC::LE;
  2652. break;
  2653. case ISD::SETNE:
  2654. case ISD::SETUNE:
  2655. CondCode = AArch64CC::NE;
  2656. break;
  2657. }
  2658. }
  2659. /// Convert a DAG fp condition code to an AArch64 CC.
  2660. /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
  2661. /// should be AND'ed instead of OR'ed.
  2662. static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
  2663. AArch64CC::CondCode &CondCode,
  2664. AArch64CC::CondCode &CondCode2) {
  2665. CondCode2 = AArch64CC::AL;
  2666. switch (CC) {
  2667. default:
  2668. changeFPCCToAArch64CC(CC, CondCode, CondCode2);
  2669. assert(CondCode2 == AArch64CC::AL);
  2670. break;
  2671. case ISD::SETONE:
  2672. // (a one b)
  2673. // == ((a olt b) || (a ogt b))
  2674. // == ((a ord b) && (a une b))
  2675. CondCode = AArch64CC::VC;
  2676. CondCode2 = AArch64CC::NE;
  2677. break;
  2678. case ISD::SETUEQ:
  2679. // (a ueq b)
  2680. // == ((a uno b) || (a oeq b))
  2681. // == ((a ule b) && (a uge b))
  2682. CondCode = AArch64CC::PL;
  2683. CondCode2 = AArch64CC::LE;
  2684. break;
  2685. }
  2686. }
  2687. /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
  2688. /// CC usable with the vector instructions. Fewer operations are available
  2689. /// without a real NZCV register, so we have to use less efficient combinations
  2690. /// to get the same effect.
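/// For example, SETUO has no direct compare-mask equivalent, so it is emitted
/// as the ordered pair (MI, GE) with Invert set, i.e. !(SETO).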
  2691. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
  2692. AArch64CC::CondCode &CondCode,
  2693. AArch64CC::CondCode &CondCode2,
  2694. bool &Invert) {
  2695. Invert = false;
  2696. switch (CC) {
  2697. default:
  2698. // Mostly the scalar mappings work fine.
  2699. changeFPCCToAArch64CC(CC, CondCode, CondCode2);
  2700. break;
  2701. case ISD::SETUO:
  2702. Invert = true;
  2703. [[fallthrough]];
  2704. case ISD::SETO:
  2705. CondCode = AArch64CC::MI;
  2706. CondCode2 = AArch64CC::GE;
  2707. break;
  2708. case ISD::SETUEQ:
  2709. case ISD::SETULT:
  2710. case ISD::SETULE:
  2711. case ISD::SETUGT:
  2712. case ISD::SETUGE:
  2713. // All of the compare-mask comparisons are ordered, but we can switch
  2714. // between the two by a double inversion. E.g. ULE == !OGT.
  2715. Invert = true;
  2716. changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
  2717. CondCode, CondCode2);
  2718. break;
  2719. }
  2720. }
  2721. static bool isLegalArithImmed(uint64_t C) {
  2722. // Matches AArch64DAGToDAGISel::SelectArithImmed().
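  // i.e. a 12-bit unsigned immediate, optionally shifted left by 12 bits:
  // 0xFFF and 0xFFF000 are legal, 0x1001 is not.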
  2723. bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
  2724. LLVM_DEBUG(dbgs() << "Is imm " << C
  2725. << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  2726. return IsLegal;
  2727. }
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on the
// grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are, then
// everything is fine. If not, the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C or V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
static bool isCMN(SDValue Op, ISD::CondCode CC) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
         (CC == ISD::SETEQ || CC == ISD::SETNE);
}
  2742. static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
  2743. SelectionDAG &DAG, SDValue Chain,
  2744. bool IsSignaling) {
  2745. EVT VT = LHS.getValueType();
  2746. assert(VT != MVT::f128);
  2747. const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  2748. if (VT == MVT::f16 && !FullFP16) {
  2749. LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
  2750. {Chain, LHS});
  2751. RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
  2752. {LHS.getValue(1), RHS});
  2753. Chain = RHS.getValue(1);
  2754. VT = MVT::f32;
  2755. }
  2756. unsigned Opcode =
  2757. IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
  2758. return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
  2759. }
  2760. static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  2761. const SDLoc &dl, SelectionDAG &DAG) {
  2762. EVT VT = LHS.getValueType();
  2763. const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  2764. if (VT.isFloatingPoint()) {
  2765. assert(VT != MVT::f128);
  2766. if (VT == MVT::f16 && !FullFP16) {
  2767. LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
  2768. RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  2769. VT = MVT::f32;
  2770. }
  2771. return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  2772. }
  2773. // The CMP instruction is just an alias for SUBS, and representing it as
  2774. // SUBS means that it's possible to get CSE with subtract operations.
  2775. // A later phase can perform the optimization of setting the destination
  2776. // register to WZR/XZR if it ends up being unused.
  2777. unsigned Opcode = AArch64ISD::SUBS;
  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  2787. } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
  2788. if (LHS.getOpcode() == ISD::AND) {
  2789. // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
  2790. // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
  2791. // of the signed comparisons.
  2792. const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
  2793. DAG.getVTList(VT, MVT_CC),
  2794. LHS.getOperand(0),
  2795. LHS.getOperand(1));
  2796. // Replace all users of (and X, Y) with newly generated (ands X, Y)
  2797. DAG.ReplaceAllUsesWith(LHS, ANDSNode);
  2798. return ANDSNode.getValue(1);
  2799. } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
  2800. // Use result of ANDS
  2801. return LHS.getValue(1);
  2802. }
  2803. }
  2804. return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
  2805. .getValue(1);
  2806. }
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
///  - We can implement (NEG SETCC), i.e. negating a single comparison, by
///    negating the flags used in a CCMP/FCCMP operation.
///  - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///    by negating the flags we test for afterwards; i.e.
///    NEG (CMP CCMP CCMP ...) can be implemented.
///  - Note that we can only ever negate all previously processed results.
///    What we cannot implement by flipping the flags to test is a negation
///    of two sub-trees (because the negation affects all sub-trees emitted so
///    far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
///  - (OR (SETCC A) (SETCC B)) can be implemented via:
///    NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
///  - After transforming OR to NEG/AND combinations we may be able to use the
///    NEG elimination rules from earlier to implement the whole thing as a
///    CCMP/FCCMP chain.
///
/// As a complete example:
///   or (or (setCA (cmp A)) (setCB (cmp B)))
///      (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///   or (and (setCC (cmp C)) (setCD (cmp D)))
///      (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///   not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///            (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
/// which can be implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
///
/// A counterexample is "or (and A B) (and C D)", which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), where
/// we can only implement one of the inner (not) operations, but not both!
/// @{
  2858. /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
  2859. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
  2860. ISD::CondCode CC, SDValue CCOp,
  2861. AArch64CC::CondCode Predicate,
  2862. AArch64CC::CondCode OutCC,
  2863. const SDLoc &DL, SelectionDAG &DAG) {
  2864. unsigned Opcode = 0;
  2865. const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  2866. if (LHS.getValueType().isFloatingPoint()) {
  2867. assert(LHS.getValueType() != MVT::f128);
  2868. if (LHS.getValueType() == MVT::f16 && !FullFP16) {
  2869. LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
  2870. RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
  2871. }
  2872. Opcode = AArch64ISD::FCCMP;
  2873. } else if (RHS.getOpcode() == ISD::SUB) {
  2874. SDValue SubOp0 = RHS.getOperand(0);
  2875. if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  2876. // See emitComparison() on why we can only do this for SETEQ and SETNE.
  2877. Opcode = AArch64ISD::CCMN;
  2878. RHS = RHS.getOperand(1);
  2879. }
  2880. }
  2881. if (Opcode == 0)
  2882. Opcode = AArch64ISD::CCMP;
  2883. SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
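  // If Predicate is false, the conditional compare does not execute; instead
  // NZCV is set to a value chosen to satisfy the *inverted* OutCC, i.e. to
  // make the subsequent check of OutCC fail.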
  2884. AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  2885. unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  2886. SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  2887. return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
  2888. }
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
///                     changing the conditions on the SETCC tests.
///                     (this means we can call emitConjunctionRec() with
///                      Negate==true on this sub-tree)
/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
///                     cannot do the negation naturally. We are required to
///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
///                     subexpression must be negated. This happens when the
///                     outer expression is an OR. We can use this fact to know
///                     that we have a double negation (or (or ...) ...) that
///                     can be implemented for free.
  2903. static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
  2904. bool &MustBeFirst, bool WillNegate,
  2905. unsigned Depth = 0) {
  2906. if (!Val.hasOneUse())
  2907. return false;
  2908. unsigned Opcode = Val->getOpcode();
  2909. if (Opcode == ISD::SETCC) {
  2910. if (Val->getOperand(0).getValueType() == MVT::f128)
  2911. return false;
  2912. CanNegate = true;
  2913. MustBeFirst = false;
  2914. return true;
  2915. }
  2916. // Protect against exponential runtime and stack overflow.
  2917. if (Depth > 6)
  2918. return false;
  2919. if (Opcode == ISD::AND || Opcode == ISD::OR) {
  2920. bool IsOR = Opcode == ISD::OR;
  2921. SDValue O0 = Val->getOperand(0);
  2922. SDValue O1 = Val->getOperand(1);
  2923. bool CanNegateL;
  2924. bool MustBeFirstL;
  2925. if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
  2926. return false;
  2927. bool CanNegateR;
  2928. bool MustBeFirstR;
  2929. if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
  2930. return false;
  2931. if (MustBeFirstL && MustBeFirstR)
  2932. return false;
  2933. if (IsOR) {
  2934. // For an OR expression we need to be able to naturally negate at least
  2935. // one side or we cannot do the transformation at all.
  2936. if (!CanNegateL && !CanNegateR)
  2937. return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
  2940. CanNegate = WillNegate && CanNegateL && CanNegateR;
  2941. // If we cannot naturally negate the whole sub-tree, then this must be
  2942. // emitted first.
  2943. MustBeFirst = !CanNegate;
  2944. } else {
  2945. assert(Opcode == ISD::AND && "Must be OR or AND");
  2946. // We cannot naturally negate an AND operation.
  2947. CanNegate = false;
  2948. MustBeFirst = MustBeFirstL || MustBeFirstR;
  2949. }
  2950. return true;
  2951. }
  2952. return false;
  2953. }
/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
/// chain of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
  2962. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
  2963. AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
  2964. AArch64CC::CondCode Predicate) {
  2965. // We're at a tree leaf, produce a conditional comparison operation.
  2966. unsigned Opcode = Val->getOpcode();
  2967. if (Opcode == ISD::SETCC) {
  2968. SDValue LHS = Val->getOperand(0);
  2969. SDValue RHS = Val->getOperand(1);
  2970. ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
  2971. bool isInteger = LHS.getValueType().isInteger();
  2972. if (Negate)
  2973. CC = getSetCCInverse(CC, LHS.getValueType());
  2974. SDLoc DL(Val);
  2975. // Determine OutCC and handle FP special case.
  2976. if (isInteger) {
  2977. OutCC = changeIntCCToAArch64CC(CC);
  2978. } else {
  2979. assert(LHS.getValueType().isFloatingPoint());
  2980. AArch64CC::CondCode ExtraCC;
  2981. changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
  2982. // Some floating point conditions can't be tested with a single condition
  2983. // code. Construct an additional comparison in this case.
  2984. if (ExtraCC != AArch64CC::AL) {
  2985. SDValue ExtraCmp;
  2986. if (!CCOp.getNode())
  2987. ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
  2988. else
  2989. ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
  2990. ExtraCC, DL, DAG);
  2991. CCOp = ExtraCmp;
  2992. Predicate = ExtraCC;
  2993. }
  2994. }
  2995. // Produce a normal comparison if we are first in the chain
  2996. if (!CCOp)
  2997. return emitComparison(LHS, RHS, CC, DL, DAG);
  2998. // Otherwise produce a ccmp.
  2999. return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
  3000. DAG);
  3001. }
  3002. assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
  3003. bool IsOR = Opcode == ISD::OR;
  3004. SDValue LHS = Val->getOperand(0);
  3005. bool CanNegateL;
  3006. bool MustBeFirstL;
  3007. bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
  3008. assert(ValidL && "Valid conjunction/disjunction tree");
  3009. (void)ValidL;
  3010. SDValue RHS = Val->getOperand(1);
  3011. bool CanNegateR;
  3012. bool MustBeFirstR;
  3013. bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
  3014. assert(ValidR && "Valid conjunction/disjunction tree");
  3015. (void)ValidR;
  3016. // Swap sub-tree that must come first to the right side.
  3017. if (MustBeFirstL) {
  3018. assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
  3019. std::swap(LHS, RHS);
  3020. std::swap(CanNegateL, CanNegateR);
  3021. std::swap(MustBeFirstL, MustBeFirstR);
  3022. }
  3023. bool NegateR;
  3024. bool NegateAfterR;
  3025. bool NegateL;
  3026. bool NegateAfterAll;
  3027. if (Opcode == ISD::OR) {
  3028. // Swap the sub-tree that we can negate naturally to the left.
  3029. if (!CanNegateL) {
  3030. assert(CanNegateR && "at least one side must be negatable");
  3031. assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
  3032. assert(!Negate);
  3033. std::swap(LHS, RHS);
  3034. NegateR = false;
  3035. NegateAfterR = true;
  3036. } else {
  3037. // Negate the left sub-tree if possible, otherwise negate the result.
  3038. NegateR = CanNegateR;
  3039. NegateAfterR = !CanNegateR;
  3040. }
  3041. NegateL = true;
  3042. NegateAfterAll = !Negate;
  3043. } else {
  3044. assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
  3045. assert(!Negate && "Valid conjunction/disjunction tree");
  3046. NegateL = false;
  3047. NegateR = false;
  3048. NegateAfterR = false;
  3049. NegateAfterAll = false;
  3050. }
  3051. // Emit sub-trees.
  3052. AArch64CC::CondCode RHSCC;
  3053. SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  3054. if (NegateAfterR)
  3055. RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  3056. SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  3057. if (NegateAfterAll)
  3058. OutCC = AArch64CC::getInvertedCondCode(OutCC);
  3059. return CmpL;
  3060. }
/// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
  3065. static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
  3066. AArch64CC::CondCode &OutCC) {
  3067. bool DummyCanNegate;
  3068. bool DummyMustBeFirst;
  3069. if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
  3070. return SDValue();
  3071. return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
  3072. }
  3073. /// @}
  3074. /// Returns how profitable it is to fold a comparison's operand's shift and/or
  3075. /// extension operations.
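/// Returns 0 if nothing would be folded, 1 if a single shift or extend could
/// be folded, and 2 if both an extend and a small shift could be folded into
/// the compare were this operand placed on the RHS.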
  3076. static unsigned getCmpOperandFoldingProfit(SDValue Op) {
  3077. auto isSupportedExtend = [&](SDValue V) {
  3078. if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
  3079. return true;
  3080. if (V.getOpcode() == ISD::AND)
  3081. if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
  3082. uint64_t Mask = MaskCst->getZExtValue();
  3083. return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
  3084. }
  3085. return false;
  3086. };
  3087. if (!Op.hasOneUse())
  3088. return 0;
  3089. if (isSupportedExtend(Op))
  3090. return 1;
  3091. unsigned Opc = Op.getOpcode();
  3092. if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
  3093. if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
  3094. uint64_t Shift = ShiftCst->getZExtValue();
  3095. if (isSupportedExtend(Op.getOperand(0)))
  3096. return (Shift <= 4) ? 2 : 1;
  3097. EVT VT = Op.getValueType();
  3098. if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
  3099. return 1;
  3100. }
  3101. return 0;
  3102. }
  3103. static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  3104. SDValue &AArch64cc, SelectionDAG &DAG,
  3105. const SDLoc &dl) {
  3106. if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
  3107. EVT VT = RHS.getValueType();
  3108. uint64_t C = RHSC->getZExtValue();
  3109. if (!isLegalArithImmed(C)) {
  3110. // Constant does not fit, try adjusting it by one?
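      // For example, (x < 0x1001) is not directly encodable, but the
      // equivalent (x <= 0x1000) is, since 0x1000 is a legal arithmetic
      // immediate.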
  3111. switch (CC) {
  3112. default:
  3113. break;
  3114. case ISD::SETLT:
  3115. case ISD::SETGE:
  3116. if ((VT == MVT::i32 && C != 0x80000000 &&
  3117. isLegalArithImmed((uint32_t)(C - 1))) ||
  3118. (VT == MVT::i64 && C != 0x80000000ULL &&
  3119. isLegalArithImmed(C - 1ULL))) {
  3120. CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
  3121. C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
  3122. RHS = DAG.getConstant(C, dl, VT);
  3123. }
  3124. break;
  3125. case ISD::SETULT:
  3126. case ISD::SETUGE:
  3127. if ((VT == MVT::i32 && C != 0 &&
  3128. isLegalArithImmed((uint32_t)(C - 1))) ||
  3129. (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
  3130. CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
  3131. C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
  3132. RHS = DAG.getConstant(C, dl, VT);
  3133. }
  3134. break;
  3135. case ISD::SETLE:
  3136. case ISD::SETGT:
  3137. if ((VT == MVT::i32 && C != INT32_MAX &&
  3138. isLegalArithImmed((uint32_t)(C + 1))) ||
  3139. (VT == MVT::i64 && C != INT64_MAX &&
  3140. isLegalArithImmed(C + 1ULL))) {
  3141. CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
  3142. C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
  3143. RHS = DAG.getConstant(C, dl, VT);
  3144. }
  3145. break;
  3146. case ISD::SETULE:
  3147. case ISD::SETUGT:
  3148. if ((VT == MVT::i32 && C != UINT32_MAX &&
  3149. isLegalArithImmed((uint32_t)(C + 1))) ||
  3150. (VT == MVT::i64 && C != UINT64_MAX &&
  3151. isLegalArithImmed(C + 1ULL))) {
  3152. CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
  3153. C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
  3154. RHS = DAG.getConstant(C, dl, VT);
  3155. }
  3156. break;
  3157. }
  3158. }
  3159. }
  3160. // Comparisons are canonicalized so that the RHS operand is simpler than the
  3161. // LHS one, the extreme case being when RHS is an immediate. However, AArch64
  3162. // can fold some shift+extend operations on the RHS operand, so swap the
  3163. // operands if that can be done.
  3164. //
  3165. // For example:
  3166. // lsl w13, w11, #1
  3167. // cmp w13, w12
  3168. // can be turned into:
  3169. // cmp w12, w11, lsl #1
  3170. if (!isa<ConstantSDNode>(RHS) ||
  3171. !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
  3172. SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
  3173. if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
  3174. std::swap(LHS, RHS);
  3175. CC = ISD::getSetCCSwappedOperands(CC);
  3176. }
  3177. }
  3178. SDValue Cmp;
  3179. AArch64CC::CondCode AArch64CC;
  3180. if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
  3181. const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
  3182. // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
  3183. // For the i8 operand, the largest immediate is 255, so this can be easily
  3184. // encoded in the compare instruction. For the i16 operand, however, the
  3185. // largest immediate cannot be encoded in the compare.
  3186. // Therefore, use a sign extending load and cmn to avoid materializing the
  3187. // -1 constant. For example,
  3188. // movz w1, #65535
  3189. // ldrh w0, [x0, #0]
  3190. // cmp w0, w1
  3191. // >
  3192. // ldrsh w0, [x0, #0]
  3193. // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext
    // RHS) if and only if (sext LHS) == (sext RHS). The checks are in place
    // to ensure both the LHS and RHS are truly zero extended and to make sure
    // the transformation is profitable.
  3198. if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
  3199. cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
  3200. cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
  3201. LHS.getNode()->hasNUsesOfValue(1, 0)) {
  3202. int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
  3203. if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
  3204. SDValue SExt =
  3205. DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
  3206. DAG.getValueType(MVT::i16));
  3207. Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
  3208. RHS.getValueType()),
  3209. CC, dl, DAG);
  3210. AArch64CC = changeIntCCToAArch64CC(CC);
  3211. }
  3212. }
  3213. if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
  3214. if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
  3215. if ((CC == ISD::SETNE) ^ RHSC->isZero())
  3216. AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  3217. }
  3218. }
  3219. }
  3220. if (!Cmp) {
  3221. Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  3222. AArch64CC = changeIntCCToAArch64CC(CC);
  3223. }
  3224. AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
  3225. return Cmp;
  3226. }
  3227. static std::pair<SDValue, SDValue>
  3228. getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  3229. assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
  3230. "Unsupported value type");
  3231. SDValue Value, Overflow;
  3232. SDLoc DL(Op);
  3233. SDValue LHS = Op.getOperand(0);
  3234. SDValue RHS = Op.getOperand(1);
  3235. unsigned Opc = 0;
  3236. switch (Op.getOpcode()) {
  3237. default:
  3238. llvm_unreachable("Unknown overflow instruction!");
  3239. case ISD::SADDO:
  3240. Opc = AArch64ISD::ADDS;
  3241. CC = AArch64CC::VS;
  3242. break;
  3243. case ISD::UADDO:
  3244. Opc = AArch64ISD::ADDS;
  3245. CC = AArch64CC::HS;
  3246. break;
  3247. case ISD::SSUBO:
  3248. Opc = AArch64ISD::SUBS;
  3249. CC = AArch64CC::VS;
  3250. break;
  3251. case ISD::USUBO:
  3252. Opc = AArch64ISD::SUBS;
  3253. CC = AArch64CC::LO;
  3254. break;
  // Multiply needs a little bit of extra work.
  3256. case ISD::SMULO:
  3257. case ISD::UMULO: {
  3258. CC = AArch64CC::NE;
  3259. bool IsSigned = Op.getOpcode() == ISD::SMULO;
  3260. if (Op.getValueType() == MVT::i32) {
  3261. // Extend to 64-bits, then perform a 64-bit multiply.
  3262. unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  3263. LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
  3264. RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
  3265. SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
  3266. Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
  3267. // Check that the result fits into a 32-bit integer.
  3268. SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
  3269. if (IsSigned) {
  3270. // cmp xreg, wreg, sxtw
  3271. SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
  3272. Overflow =
  3273. DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
  3274. } else {
  3275. // tst xreg, #0xffffffff00000000
  3276. SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
  3277. Overflow =
  3278. DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
  3279. }
  3280. break;
  3281. }
  3282. assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
  3283. // For the 64 bit multiply
  3284. Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
  3285. if (IsSigned) {
  3286. SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
  3287. SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
  3288. DAG.getConstant(63, DL, MVT::i64));
  3289. // It is important that LowerBits is last, otherwise the arithmetic
  3290. // shift will not be folded into the compare (SUBS).
  3291. SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
  3292. Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
  3293. .getValue(1);
  3294. } else {
  3295. SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
  3296. SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
  3297. Overflow =
  3298. DAG.getNode(AArch64ISD::SUBS, DL, VTs,
  3299. DAG.getConstant(0, DL, MVT::i64),
  3300. UpperBits).getValue(1);
  3301. }
  3302. break;
  3303. }
  3304. } // switch (...)
  3305. if (Opc) {
  3306. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
  3307. // Emit the AArch64 operation with overflow check.
  3308. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
  3309. Overflow = Value.getValue(1);
  3310. }
  3311. return std::make_pair(Value, Overflow);
  3312. }
  3313. SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
  3314. if (useSVEForFixedLengthVectorVT(Op.getValueType(),
  3315. Subtarget->forceStreamingCompatibleSVE()))
  3316. return LowerToScalableOp(Op, DAG);
  3317. SDValue Sel = Op.getOperand(0);
  3318. SDValue Other = Op.getOperand(1);
  3319. SDLoc dl(Sel);
  3320. // If the operand is an overflow checking operation, invert the condition
  3321. // code and kill the Not operation. I.e., transform:
  3322. // (xor (overflow_op_bool, 1))
  3323. // -->
  3324. // (csel 1, 0, invert(cc), overflow_op_bool)
  3325. // ... which later gets transformed to just a cset instruction with an
  3326. // inverted condition code, rather than a cset + eor sequence.
  3327. if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
  3328. // Only lower legal XALUO ops.
  3329. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
  3330. return SDValue();
  3331. SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  3332. SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  3333. AArch64CC::CondCode CC;
  3334. SDValue Value, Overflow;
  3335. std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
  3336. SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
  3337. return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
  3338. CCVal, Overflow);
  3339. }
  3340. // If neither operand is a SELECT_CC, give up.
  3341. if (Sel.getOpcode() != ISD::SELECT_CC)
  3342. std::swap(Sel, Other);
  3343. if (Sel.getOpcode() != ISD::SELECT_CC)
  3344. return Op;
  3345. // The folding we want to perform is:
  3346. // (xor x, (select_cc a, b, cc, 0, -1) )
  3347. // -->
  3348. // (csel x, (xor x, -1), cc ...)
  3349. //
  3350. // The latter will get matched to a CSINV instruction.
  3351. ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  3352. SDValue LHS = Sel.getOperand(0);
  3353. SDValue RHS = Sel.getOperand(1);
  3354. SDValue TVal = Sel.getOperand(2);
  3355. SDValue FVal = Sel.getOperand(3);
  3356. // FIXME: This could be generalized to non-integer comparisons.
  3357. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
  3358. return Op;
  3359. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  3360. ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
  3361. // The values aren't constants, this isn't the pattern we're looking for.
  3362. if (!CFVal || !CTVal)
  3363. return Op;
  3364. // We can commute the SELECT_CC by inverting the condition. This
  3365. // might be needed to make this fit into a CSINV pattern.
  3366. if (CTVal->isAllOnes() && CFVal->isZero()) {
  3367. std::swap(TVal, FVal);
  3368. std::swap(CTVal, CFVal);
  3369. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  3370. }
  3371. // If the constants line up, perform the transform!
  3372. if (CTVal->isZero() && CFVal->isAllOnes()) {
  3373. SDValue CCVal;
  3374. SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
  3375. FVal = Other;
  3376. TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
  3377. DAG.getConstant(-1ULL, dl, Other.getValueType()));
  3378. return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
  3379. CCVal, Cmp);
  3380. }
  3381. return Op;
  3382. }
  3383. // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
  3384. // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
  3385. // sets 'C' bit to 0.
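// In other words, with Invert == false this emits (SUBS Value, 1), whose C
// flag is the "Value != 0" test; with Invert == true it emits (SUBS 0, Value),
// whose C flag is the "Value == 0" test.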
  3386. static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
  3387. SDLoc DL(Value);
  3388. EVT VT = Value.getValueType();
  3389. SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
  3390. SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
  3391. SDValue Cmp =
  3392. DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
  3393. return Cmp.getValue(1);
  3394. }
  3395. // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
  3396. // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
  3397. static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
  3398. bool Invert) {
  3399. assert(Flag.getResNo() == 1);
  3400. SDLoc DL(Flag);
  3401. SDValue Zero = DAG.getConstant(0, DL, VT);
  3402. SDValue One = DAG.getConstant(1, DL, VT);
  3403. unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
  3404. SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
  3405. return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
  3406. }
  3407. // Value is 1 if 'V' bit of NZCV is 1, else 0
  3408. static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
  3409. assert(Flag.getResNo() == 1);
  3410. SDLoc DL(Flag);
  3411. SDValue Zero = DAG.getConstant(0, DL, VT);
  3412. SDValue One = DAG.getConstant(1, DL, VT);
  3413. SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
  3414. return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
  3415. }
  3416. // This lowering is inefficient, but it will get cleaned up by
  3417. // `foldOverflowCheck`
  3418. static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
  3419. bool IsSigned) {
  3420. EVT VT0 = Op.getValue(0).getValueType();
  3421. EVT VT1 = Op.getValue(1).getValueType();
  3422. if (VT0 != MVT::i32 && VT0 != MVT::i64)
  3423. return SDValue();
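  // AArch64's SBCS subtracts the *inverted* carry (a set C flag means "no
  // borrow"), so the incoming borrow value has to be inverted when it is
  // converted to the C flag below.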
  3424. bool InvertCarry = Opcode == AArch64ISD::SBCS;
  3425. SDValue OpLHS = Op.getOperand(0);
  3426. SDValue OpRHS = Op.getOperand(1);
  3427. SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
  3428. SDLoc DL(Op);
  3429. SDVTList VTs = DAG.getVTList(VT0, VT1);
  3430. SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
  3431. OpRHS, OpCarryIn);
  3432. SDValue OutFlag =
  3433. IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
  3434. : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
  3435. return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
  3436. }
  3437. static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  3438. // Let legalize expand this if it isn't a legal type yet.
  3439. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
  3440. return SDValue();
  3441. SDLoc dl(Op);
  3442. AArch64CC::CondCode CC;
  3443. // The actual operation that sets the overflow or carry flag.
  3444. SDValue Value, Overflow;
  3445. std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
  3446. // We use 0 and 1 as false and true values.
  3447. SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  3448. SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  3449. // We use an inverted condition, because the conditional select is inverted
  3450. // too. This will allow it to be selected to a single instruction:
  3451. // CSINC Wd, WZR, WZR, invert(cond).
  3452. SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
  3453. Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
  3454. CCVal, Overflow);
  3455. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  3456. return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  3457. }
  3458. // Prefetch operands are:
  3459. // 1: Address to prefetch
  3460. // 2: bool isWrite
  3461. // 3: int locality (0 = no locality ... 3 = extreme locality)
  3462. // 4: bool isDataCache
  3463. static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  3464. SDLoc DL(Op);
  3465. unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  3466. unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  3467. unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  3468. bool IsStream = !Locality;
  3469. // When the locality number is set
  3470. if (Locality) {
  3471. // The front-end should have filtered out the out-of-range values
  3472. assert(Locality <= 3 && "Prefetch locality out-of-range");
  3473. // The locality degree is the opposite of the cache speed.
  3474. // Put the number the other way around.
  3475. // The encoding starts at 0 for level 1
  3476. Locality = 3 - Locality;
  3477. }
  // Build the mask value encoding the expected behavior.
  3479. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
  3480. (!IsData << 3) | // IsDataCache bit
  3481. (Locality << 1) | // Cache level bits
  3482. (unsigned)IsStream; // Stream bit
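  // For example, __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) is a data
  // read with maximum locality, giving PrfOp == 0b00000, i.e. PLDL1KEEP.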
  3483. return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
  3484. DAG.getTargetConstant(PrfOp, DL, MVT::i32),
  3485. Op.getOperand(1));
  3486. }
  3487. SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
  3488. SelectionDAG &DAG) const {
  3489. EVT VT = Op.getValueType();
  3490. if (VT.isScalableVector())
  3491. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
  3492. if (useSVEForFixedLengthVectorVT(VT))
  3493. return LowerFixedLengthFPExtendToSVE(Op, DAG);
  3494. assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
  3495. return SDValue();
  3496. }
  3497. SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
  3498. SelectionDAG &DAG) const {
  3499. if (Op.getValueType().isScalableVector())
  3500. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
  3501. bool IsStrict = Op->isStrictFPOpcode();
  3502. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  3503. EVT SrcVT = SrcVal.getValueType();
  3504. if (useSVEForFixedLengthVectorVT(SrcVT,
  3505. Subtarget->forceStreamingCompatibleSVE()))
  3506. return LowerFixedLengthFPRoundToSVE(Op, DAG);
  3507. if (SrcVT != MVT::f128) {
  3508. // Expand cases where the input is a vector bigger than NEON.
  3509. if (useSVEForFixedLengthVectorVT(SrcVT))
  3510. return SDValue();
  3511. // It's legal except when f128 is involved
  3512. return Op;
  3513. }
  3514. return SDValue();
  3515. }
  3516. SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
  3517. SelectionDAG &DAG) const {
  3518. // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  3519. // Any additional optimization in this function should be recorded
  3520. // in the cost tables.
  3521. bool IsStrict = Op->isStrictFPOpcode();
  3522. EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
  3523. EVT VT = Op.getValueType();
  3524. if (VT.isScalableVector()) {
  3525. unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
  3526. ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
  3527. : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
  3528. return LowerToPredicatedOp(Op, DAG, Opcode);
  3529. }
  3530. if (useSVEForFixedLengthVectorVT(VT,
  3531. Subtarget->forceStreamingCompatibleSVE()) ||
  3532. useSVEForFixedLengthVectorVT(InVT,
  3533. Subtarget->forceStreamingCompatibleSVE()))
  3534. return LowerFixedLengthFPToIntToSVE(Op, DAG);
  3535. unsigned NumElts = InVT.getVectorNumElements();
  3536. // f16 conversions are promoted to f32 when full fp16 is not supported.
  3537. if (InVT.getVectorElementType() == MVT::f16 &&
  3538. !Subtarget->hasFullFP16()) {
  3539. MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
  3540. SDLoc dl(Op);
  3541. if (IsStrict) {
  3542. SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
  3543. {Op.getOperand(0), Op.getOperand(1)});
  3544. return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
  3545. {Ext.getValue(1), Ext.getValue(0)});
  3546. }
  3547. return DAG.getNode(
  3548. Op.getOpcode(), dl, Op.getValueType(),
  3549. DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
  3550. }
  3551. uint64_t VTSize = VT.getFixedSizeInBits();
  3552. uint64_t InVTSize = InVT.getFixedSizeInBits();
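  // Narrowing conversions (e.g. v2f64 -> v2i32) are done as a same-width
  // convert followed by a truncate; widening ones (e.g. v4f16 -> v4i32)
  // extend the source to the wider FP type first.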
  3553. if (VTSize < InVTSize) {
  3554. SDLoc dl(Op);
  3555. if (IsStrict) {
  3556. InVT = InVT.changeVectorElementTypeToInteger();
  3557. SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
  3558. {Op.getOperand(0), Op.getOperand(1)});
  3559. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
  3560. return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
  3561. }
  3562. SDValue Cv =
  3563. DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
  3564. Op.getOperand(0));
  3565. return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
  3566. }
  3567. if (VTSize > InVTSize) {
  3568. SDLoc dl(Op);
  3569. MVT ExtVT =
  3570. MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
  3571. VT.getVectorNumElements());
  3572. if (IsStrict) {
  3573. SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
  3574. {Op.getOperand(0), Op.getOperand(1)});
  3575. return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
  3576. {Ext.getValue(1), Ext.getValue(0)});
  3577. }
  3578. SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
  3579. return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
  3580. }
  3581. // Use a scalar operation for conversions between single-element vectors of
  3582. // the same size.
  3583. if (NumElts == 1) {
  3584. SDLoc dl(Op);
  3585. SDValue Extract = DAG.getNode(
  3586. ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
  3587. Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
  3588. EVT ScalarVT = VT.getScalarType();
  3589. if (IsStrict)
  3590. return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
  3591. {Op.getOperand(0), Extract});
  3592. return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  3593. }
  3594. // Type changing conversions are illegal.
  3595. return Op;
  3596. }
  3597. SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
  3598. SelectionDAG &DAG) const {
  3599. bool IsStrict = Op->isStrictFPOpcode();
  3600. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  3601. if (SrcVal.getValueType().isVector())
  3602. return LowerVectorFP_TO_INT(Op, DAG);
  3603. // f16 conversions are promoted to f32 when full fp16 is not supported.
  3604. if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
  3605. SDLoc dl(Op);
  3606. if (IsStrict) {
  3607. SDValue Ext =
  3608. DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
  3609. {Op.getOperand(0), SrcVal});
  3610. return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
  3611. {Ext.getValue(1), Ext.getValue(0)});
  3612. }
  3613. return DAG.getNode(
  3614. Op.getOpcode(), dl, Op.getValueType(),
  3615. DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
  3616. }
  3617. if (SrcVal.getValueType() != MVT::f128) {
  3618. // It's legal except when f128 is involved
  3619. return Op;
  3620. }
  3621. return SDValue();
  3622. }
  3623. SDValue
  3624. AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
  3625. SelectionDAG &DAG) const {
  3626. // AArch64 FP-to-int conversions saturate to the destination element size, so
  3627. // we can lower common saturating conversions to simple instructions.
  3628. SDValue SrcVal = Op.getOperand(0);
  3629. EVT SrcVT = SrcVal.getValueType();
  3630. EVT DstVT = Op.getValueType();
  3631. EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  3632. uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
  3633. uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
  3634. uint64_t SatWidth = SatVT.getScalarSizeInBits();
  3635. assert(SatWidth <= DstElementWidth &&
  3636. "Saturation width cannot exceed result width");
  3637. // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
  3638. // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
  3639. // types, so this is hard to reach.
  3640. if (DstVT.isScalableVector())
  3641. return SDValue();
  3642. EVT SrcElementVT = SrcVT.getVectorElementType();
  3643. // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  3644. if (SrcElementVT == MVT::f16 &&
  3645. (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
  3646. MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
  3647. SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
  3648. SrcVT = F32VT;
  3649. SrcElementVT = MVT::f32;
  3650. SrcElementWidth = 32;
  3651. } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
  3652. SrcElementVT != MVT::f16)
  3653. return SDValue();
  3654. SDLoc DL(Op);
  3655. // Cases that we can emit directly.
  3656. if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
  3657. return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
  3658. DAG.getValueType(DstVT.getScalarType()));
  3659. // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  3660. // result. This is only valid if the legal cvt is larger than the saturate
  3661. // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
  3662. // (at least until sqxtn is selected).
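  // For example, a v4f32 -> v4i16 saturating conversion becomes: fcvtzs to
  // v4i32, clamp with SMIN/SMAX (or UMIN for the unsigned case) to the i16
  // range, then truncate to v4i16.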
  3663. if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
  3664. return SDValue();
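// For example, a v4f32 -> v4i16 fptosi.sat with i16 saturation converts to
// v4i32 with native i32 saturation, clamps the result to [-32768, 32767] with
// SMIN/SMAX, and truncates to v4i16.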
  3665. EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  3666. SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
  3667. DAG.getValueType(IntVT.getScalarType()));
  3668. SDValue Sat;
  3669. if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
  3670. SDValue MinC = DAG.getConstant(
  3671. APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
  3672. SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
  3673. SDValue MaxC = DAG.getConstant(
  3674. APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
  3675. Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
  3676. } else {
  3677. SDValue MinC = DAG.getConstant(
  3678. APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
  3679. Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
  3680. }
  3681. return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
  3682. }
  3683. SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
  3684. SelectionDAG &DAG) const {
  3685. // AArch64 FP-to-int conversions saturate to the destination register size, so
  3686. // we can lower common saturating conversions to simple instructions.
  3687. SDValue SrcVal = Op.getOperand(0);
  3688. EVT SrcVT = SrcVal.getValueType();
  3689. if (SrcVT.isVector())
  3690. return LowerVectorFP_TO_INT_SAT(Op, DAG);
  3691. EVT DstVT = Op.getValueType();
  3692. EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  3693. uint64_t SatWidth = SatVT.getScalarSizeInBits();
  3694. uint64_t DstWidth = DstVT.getScalarSizeInBits();
  3695. assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
  3696. // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  3697. if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
  3698. SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
  3699. SrcVT = MVT::f32;
  3700. } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
  3701. return SDValue();
  3702. SDLoc DL(Op);
  3703. // Cases that we can emit directly.
  3704. if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
  3705. (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
  3706. DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
  3707. return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
  3708. DAG.getValueType(DstVT));
3709. // Otherwise we emit a cvt that saturates to a higher bitwidth and then saturate
3710. // the result ourselves. This is only valid if the legal cvt is at least as wide
3711. // as the saturate width.
  3712. if (DstWidth < SatWidth)
  3713. return SDValue();
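// For example, an f32 -> i32 fptosi.sat with i8 saturation emits a native
// conversion saturating to i32 and then clamps the result to [-128, 127].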
  3714. SDValue NativeCvt =
  3715. DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
  3716. SDValue Sat;
  3717. if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
  3718. SDValue MinC = DAG.getConstant(
  3719. APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
  3720. SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
  3721. SDValue MaxC = DAG.getConstant(
  3722. APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
  3723. Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
  3724. } else {
  3725. SDValue MinC = DAG.getConstant(
  3726. APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
  3727. Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
  3728. }
  3729. return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
  3730. }
  3731. SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
  3732. SelectionDAG &DAG) const {
  3733. // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  3734. // Any additional optimization in this function should be recorded
  3735. // in the cost tables.
  3736. bool IsStrict = Op->isStrictFPOpcode();
  3737. EVT VT = Op.getValueType();
  3738. SDLoc dl(Op);
  3739. SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  3740. EVT InVT = In.getValueType();
  3741. unsigned Opc = Op.getOpcode();
  3742. bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  3743. if (VT.isScalableVector()) {
  3744. if (InVT.getVectorElementType() == MVT::i1) {
3745. // We can't convert an SVE predicate directly; extend it to an integer vector first.
  3746. unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  3747. EVT CastVT = getPromotedVTForPredicate(InVT);
  3748. In = DAG.getNode(CastOpc, dl, CastVT, In);
  3749. return DAG.getNode(Opc, dl, VT, In);
  3750. }
  3751. unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
  3752. : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
  3753. return LowerToPredicatedOp(Op, DAG, Opcode);
  3754. }
  3755. if (useSVEForFixedLengthVectorVT(VT,
  3756. Subtarget->forceStreamingCompatibleSVE()) ||
  3757. useSVEForFixedLengthVectorVT(InVT,
  3758. Subtarget->forceStreamingCompatibleSVE()))
  3759. return LowerFixedLengthIntToFPToSVE(Op, DAG);
  3760. uint64_t VTSize = VT.getFixedSizeInBits();
  3761. uint64_t InVTSize = InVT.getFixedSizeInBits();
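// Size-changing conversions go via a same-width type: a narrowing result
// converts at the source width and then rounds (e.g. v2i64 -> v2f32 via
// v2f64), while a widening result extends the integer input first
// (e.g. v4i16 -> v4f32 via v4i32).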
  3762. if (VTSize < InVTSize) {
  3763. MVT CastVT =
  3764. MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
  3765. InVT.getVectorNumElements());
  3766. if (IsStrict) {
  3767. In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
  3768. {Op.getOperand(0), In});
  3769. return DAG.getNode(
  3770. ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
  3771. {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
  3772. }
  3773. In = DAG.getNode(Opc, dl, CastVT, In);
  3774. return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
  3775. DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  3776. }
  3777. if (VTSize > InVTSize) {
  3778. unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  3779. EVT CastVT = VT.changeVectorElementTypeToInteger();
  3780. In = DAG.getNode(CastOpc, dl, CastVT, In);
  3781. if (IsStrict)
  3782. return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
  3783. return DAG.getNode(Opc, dl, VT, In);
  3784. }
  3785. // Use a scalar operation for conversions between single-element vectors of
  3786. // the same size.
  3787. if (VT.getVectorNumElements() == 1) {
  3788. SDValue Extract = DAG.getNode(
  3789. ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
  3790. In, DAG.getConstant(0, dl, MVT::i64));
  3791. EVT ScalarVT = VT.getScalarType();
  3792. if (IsStrict)
  3793. return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
  3794. {Op.getOperand(0), Extract});
  3795. return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  3796. }
  3797. return Op;
  3798. }
  3799. SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
  3800. SelectionDAG &DAG) const {
  3801. if (Op.getValueType().isVector())
  3802. return LowerVectorINT_TO_FP(Op, DAG);
  3803. bool IsStrict = Op->isStrictFPOpcode();
  3804. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  3805. // f16 conversions are promoted to f32 when full fp16 is not supported.
  3806. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
  3807. SDLoc dl(Op);
  3808. if (IsStrict) {
  3809. SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
  3810. {Op.getOperand(0), SrcVal});
  3811. return DAG.getNode(
  3812. ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
  3813. {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
  3814. }
  3815. return DAG.getNode(
  3816. ISD::FP_ROUND, dl, MVT::f16,
  3817. DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
  3818. DAG.getIntPtrConstant(0, dl));
  3819. }
  3820. // i128 conversions are libcalls.
  3821. if (SrcVal.getValueType() == MVT::i128)
  3822. return SDValue();
3823. // Other conversions are legal, unless the result is the completely
3824. // software-based fp128.
  3825. if (Op.getValueType() != MVT::f128)
  3826. return Op;
  3827. return SDValue();
  3828. }
  3829. SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
  3830. SelectionDAG &DAG) const {
  3831. // For iOS, we want to call an alternative entry point: __sincos_stret,
  3832. // which returns the values in two S / D registers.
  3833. SDLoc dl(Op);
  3834. SDValue Arg = Op.getOperand(0);
  3835. EVT ArgVT = Arg.getValueType();
  3836. Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  3837. ArgListTy Args;
  3838. ArgListEntry Entry;
  3839. Entry.Node = Arg;
  3840. Entry.Ty = ArgTy;
  3841. Entry.IsSExt = false;
  3842. Entry.IsZExt = false;
  3843. Args.push_back(Entry);
  3844. RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
  3845. : RTLIB::SINCOS_STRET_F32;
  3846. const char *LibcallName = getLibcallName(LC);
  3847. SDValue Callee =
  3848. DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
  3849. StructType *RetTy = StructType::get(ArgTy, ArgTy);
  3850. TargetLowering::CallLoweringInfo CLI(DAG);
  3851. CLI.setDebugLoc(dl)
  3852. .setChain(DAG.getEntryNode())
  3853. .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
  3854. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  3855. return CallResult.first;
  3856. }
  3857. static MVT getSVEContainerType(EVT ContentTy);
  3858. SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
  3859. SelectionDAG &DAG) const {
  3860. EVT OpVT = Op.getValueType();
  3861. EVT ArgVT = Op.getOperand(0).getValueType();
  3862. if (useSVEForFixedLengthVectorVT(OpVT))
  3863. return LowerFixedLengthBitcastToSVE(Op, DAG);
  3864. if (OpVT.isScalableVector()) {
  3865. // Bitcasting between unpacked vector types of different element counts is
  3866. // not a NOP because the live elements are laid out differently.
  3867. // 01234567
  3868. // e.g. nxv2i32 = XX??XX??
  3869. // nxv4f16 = X?X?X?X?
  3870. if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
  3871. return SDValue();
  3872. if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
  3873. assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
  3874. "Expected int->fp bitcast!");
  3875. SDValue ExtResult =
  3876. DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
  3877. Op.getOperand(0));
  3878. return getSVESafeBitCast(OpVT, ExtResult, DAG);
  3879. }
  3880. return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
  3881. }
  3882. if (OpVT != MVT::f16 && OpVT != MVT::bf16)
  3883. return SDValue();
  3884. // Bitcasts between f16 and bf16 are legal.
  3885. if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
  3886. return Op;
  3887. assert(ArgVT == MVT::i16);
  3888. SDLoc DL(Op);
  3889. Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  3890. Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
  3891. return SDValue(
  3892. DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
  3893. DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
  3894. 0);
  3895. }
  3896. static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  3897. if (OrigVT.getSizeInBits() >= 64)
  3898. return OrigVT;
  3899. assert(OrigVT.isSimple() && "Expecting a simple value type");
  3900. MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  3901. switch (OrigSimpleTy) {
  3902. default: llvm_unreachable("Unexpected Vector Type");
  3903. case MVT::v2i8:
  3904. case MVT::v2i16:
  3905. return MVT::v2i32;
  3906. case MVT::v4i8:
  3907. return MVT::v4i16;
  3908. }
  3909. }
  3910. static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
  3911. const EVT &OrigTy,
  3912. const EVT &ExtTy,
  3913. unsigned ExtOpcode) {
  3914. // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  3915. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  3916. // 64-bits we need to insert a new extension so that it will be 64-bits.
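// For example, a v4i8 operand whose original extension to v4i32 is being
// skipped gets a new extension to v4i16 here so that it can feed a 64-bit
// [SU]MULL input.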
  3917. assert(ExtTy.is128BitVector() && "Unexpected extension size");
  3918. if (OrigTy.getSizeInBits() >= 64)
  3919. return N;
  3920. // Must extend size to at least 64 bits to be used as an operand for VMULL.
  3921. EVT NewVT = getExtensionTo64Bits(OrigTy);
  3922. return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
  3923. }
  3924. // Returns lane if Op extracts from a two-element vector and lane is constant
  3925. // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
  3926. static std::optional<uint64_t>
  3927. getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
  3928. SDNode *OpNode = Op.getNode();
  3929. if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  3930. return std::nullopt;
  3931. EVT VT = OpNode->getOperand(0).getValueType();
  3932. ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
  3933. if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
  3934. return std::nullopt;
  3935. return C->getZExtValue();
  3936. }
  3937. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
  3938. bool isSigned) {
  3939. EVT VT = N->getValueType(0);
  3940. if (N->getOpcode() != ISD::BUILD_VECTOR)
  3941. return false;
  3942. for (const SDValue &Elt : N->op_values()) {
  3943. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
  3944. unsigned EltSize = VT.getScalarSizeInBits();
  3945. unsigned HalfSize = EltSize / 2;
  3946. if (isSigned) {
  3947. if (!isIntN(HalfSize, C->getSExtValue()))
  3948. return false;
  3949. } else {
  3950. if (!isUIntN(HalfSize, C->getZExtValue()))
  3951. return false;
  3952. }
  3953. continue;
  3954. }
  3955. return false;
  3956. }
  3957. return true;
  3958. }
  3959. static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
  3960. if (N->getOpcode() == ISD::SIGN_EXTEND ||
  3961. N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
  3962. return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
  3963. N->getOperand(0)->getValueType(0),
  3964. N->getValueType(0),
  3965. N->getOpcode());
  3966. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  3967. EVT VT = N->getValueType(0);
  3968. SDLoc dl(N);
  3969. unsigned EltSize = VT.getScalarSizeInBits() / 2;
  3970. unsigned NumElts = VT.getVectorNumElements();
  3971. MVT TruncVT = MVT::getIntegerVT(EltSize);
  3972. SmallVector<SDValue, 8> Ops;
  3973. for (unsigned i = 0; i != NumElts; ++i) {
  3974. ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
  3975. const APInt &CInt = C->getAPIntValue();
  3976. // Element types smaller than 32 bits are not legal, so use i32 elements.
  3977. // The values are implicitly truncated so sext vs. zext doesn't matter.
  3978. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  3979. }
  3980. return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
  3981. }
  3982. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  3983. return N->getOpcode() == ISD::SIGN_EXTEND ||
  3984. N->getOpcode() == ISD::ANY_EXTEND ||
  3985. isExtendedBUILD_VECTOR(N, DAG, true);
  3986. }
  3987. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  3988. return N->getOpcode() == ISD::ZERO_EXTEND ||
  3989. N->getOpcode() == ISD::ANY_EXTEND ||
  3990. isExtendedBUILD_VECTOR(N, DAG, false);
  3991. }
  3992. static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  3993. unsigned Opcode = N->getOpcode();
  3994. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  3995. SDNode *N0 = N->getOperand(0).getNode();
  3996. SDNode *N1 = N->getOperand(1).getNode();
  3997. return N0->hasOneUse() && N1->hasOneUse() &&
  3998. isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  3999. }
  4000. return false;
  4001. }
  4002. static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  4003. unsigned Opcode = N->getOpcode();
  4004. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  4005. SDNode *N0 = N->getOperand(0).getNode();
  4006. SDNode *N1 = N->getOperand(1).getNode();
  4007. return N0->hasOneUse() && N1->hasOneUse() &&
  4008. isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  4009. }
  4010. return false;
  4011. }
  4012. SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
  4013. SelectionDAG &DAG) const {
4014. // The rounding mode is in bits 23:22 of the FPCR.
4015. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4016. // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
4017. // so that the shift and the AND get folded into a bitfield extract.
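// For example, an FPCR.RMode of 0b11 (round toward zero) becomes
// ((3 + 1) & 3) = 0, which is the FLT_ROUNDS encoding for round-toward-zero.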
  4018. SDLoc dl(Op);
  4019. SDValue Chain = Op.getOperand(0);
  4020. SDValue FPCR_64 = DAG.getNode(
  4021. ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
  4022. {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
  4023. Chain = FPCR_64.getValue(1);
  4024. SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
  4025. SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
  4026. DAG.getConstant(1U << 22, dl, MVT::i32));
  4027. SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
  4028. DAG.getConstant(22, dl, MVT::i32));
  4029. SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
  4030. DAG.getConstant(3, dl, MVT::i32));
  4031. return DAG.getMergeValues({AND, Chain}, dl);
  4032. }
  4033. SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
  4034. SelectionDAG &DAG) const {
  4035. SDLoc DL(Op);
  4036. SDValue Chain = Op->getOperand(0);
  4037. SDValue RMValue = Op->getOperand(1);
  4038. // The rounding mode is in bits 23:22 of the FPCR.
  4039. // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
  4040. // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4041. // (((arg - 1) & 3) << 22).
  4042. //
4043. // The argument of llvm.set.rounding must be in the range [0, 3], so
4044. // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4045. // code that generates llvm.set.rounding to ensure this condition.
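// For example, llvm.set.rounding(0) (round toward zero) computes
// ((0 - 1) & 3) = 3, i.e. an FPCR.RMode of 0b11.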
  4046. // Calculate new value of FPCR[23:22].
  4047. RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
  4048. DAG.getConstant(1, DL, MVT::i32));
  4049. RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
  4050. DAG.getConstant(0x3, DL, MVT::i32));
  4051. RMValue =
  4052. DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
  4053. DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
  4054. RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
  4055. // Get current value of FPCR.
  4056. SDValue Ops[] = {
  4057. Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
  4058. SDValue FPCR =
  4059. DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
  4060. Chain = FPCR.getValue(1);
  4061. FPCR = FPCR.getValue(0);
4062. // Put the new rounding mode into FPCR[23:22].
  4063. const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
  4064. FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
  4065. DAG.getConstant(RMMask, DL, MVT::i64));
  4066. FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
  4067. SDValue Ops2[] = {
  4068. Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
  4069. FPCR};
  4070. return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
  4071. }
  4072. static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
  4073. SDLoc DL, bool &IsMLA) {
  4074. bool IsN0SExt = isSignExtended(N0, DAG);
  4075. bool IsN1SExt = isSignExtended(N1, DAG);
  4076. if (IsN0SExt && IsN1SExt)
  4077. return AArch64ISD::SMULL;
  4078. bool IsN0ZExt = isZeroExtended(N0, DAG);
  4079. bool IsN1ZExt = isZeroExtended(N1, DAG);
  4080. if (IsN0ZExt && IsN1ZExt)
  4081. return AArch64ISD::UMULL;
  4082. // Select SMULL if we can replace zext with sext.
  4083. if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
  4084. !isExtendedBUILD_VECTOR(N0, DAG, false) &&
  4085. !isExtendedBUILD_VECTOR(N1, DAG, false)) {
  4086. SDValue ZextOperand;
  4087. if (IsN0ZExt)
  4088. ZextOperand = N0->getOperand(0);
  4089. else
  4090. ZextOperand = N1->getOperand(0);
  4091. if (DAG.SignBitIsZero(ZextOperand)) {
  4092. SDNode *NewSext =
  4093. DAG.getSExtOrTrunc(ZextOperand, DL, N0->getValueType(0)).getNode();
  4094. if (IsN0ZExt)
  4095. N0 = NewSext;
  4096. else
  4097. N1 = NewSext;
  4098. return AArch64ISD::SMULL;
  4099. }
  4100. }
  4101. // Select UMULL if we can replace the other operand with an extend.
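// For example, with v8i16 operands where one is a zext from v8i8 and the other
// has its top 8 bits known to be zero, the latter is truncated to v8i8 and
// re-extended so that both operands become recognisable UMULL inputs.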
  4102. if (IsN0ZExt || IsN1ZExt) {
  4103. EVT VT = N0->getValueType(0);
  4104. APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
  4105. VT.getScalarSizeInBits() / 2);
  4106. if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) {
  4107. EVT HalfVT;
  4108. switch (VT.getSimpleVT().SimpleTy) {
  4109. case MVT::v2i64:
  4110. HalfVT = MVT::v2i32;
  4111. break;
  4112. case MVT::v4i32:
  4113. HalfVT = MVT::v4i16;
  4114. break;
  4115. case MVT::v8i16:
  4116. HalfVT = MVT::v8i8;
  4117. break;
  4118. default:
  4119. return 0;
  4120. }
  4121. // Truncate and then extend the result.
  4122. SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
  4123. SDValue(IsN0ZExt ? N1 : N0, 0));
  4124. NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT);
  4125. if (IsN0ZExt)
  4126. N1 = NewExt.getNode();
  4127. else
  4128. N0 = NewExt.getNode();
  4129. return AArch64ISD::UMULL;
  4130. }
  4131. }
  4132. if (!IsN1SExt && !IsN1ZExt)
  4133. return 0;
  4134. // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  4135. // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  4136. if (IsN1SExt && isAddSubSExt(N0, DAG)) {
  4137. IsMLA = true;
  4138. return AArch64ISD::SMULL;
  4139. }
  4140. if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
  4141. IsMLA = true;
  4142. return AArch64ISD::UMULL;
  4143. }
  4144. if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
  4145. std::swap(N0, N1);
  4146. IsMLA = true;
  4147. return AArch64ISD::UMULL;
  4148. }
  4149. return 0;
  4150. }
  4151. SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  4152. EVT VT = Op.getValueType();
  4153. // If SVE is available then i64 vector multiplications can also be made legal.
  4154. bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 ||
  4155. Subtarget->forceStreamingCompatibleSVE();
  4156. if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
  4157. return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
  4158. // Multiplications are only custom-lowered for 128-bit vectors so that
  4159. // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  4160. assert(VT.is128BitVector() && VT.isInteger() &&
  4161. "unexpected type for custom-lowering ISD::MUL");
  4162. SDNode *N0 = Op.getOperand(0).getNode();
  4163. SDNode *N1 = Op.getOperand(1).getNode();
  4164. bool isMLA = false;
  4165. SDLoc DL(Op);
  4166. unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
  4167. if (!NewOpc) {
  4168. if (VT == MVT::v2i64)
  4169. // Fall through to expand this. It is not legal.
  4170. return SDValue();
  4171. else
  4172. // Other vector multiplications are legal.
  4173. return Op;
  4174. }
  4175. // Legalize to a S/UMULL instruction
  4176. SDValue Op0;
  4177. SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  4178. if (!isMLA) {
  4179. Op0 = skipExtensionForVectorMULL(N0, DAG);
  4180. assert(Op0.getValueType().is64BitVector() &&
  4181. Op1.getValueType().is64BitVector() &&
  4182. "unexpected types for extended operands to VMULL");
  4183. return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  4184. }
4185. // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during isel
4186. // lowering to take advantage of back-to-back s/umul + s/umla with no stall.
4187. // This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
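// For example, mul(add(zext(a), zext(b)), zext(c)) becomes
// add(umull(a, c), umull(b, c)).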
  4188. SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
  4189. SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
  4190. EVT Op1VT = Op1.getValueType();
  4191. return DAG.getNode(N0->getOpcode(), DL, VT,
  4192. DAG.getNode(NewOpc, DL, VT,
  4193. DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
  4194. DAG.getNode(NewOpc, DL, VT,
  4195. DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
  4196. }
  4197. static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
  4198. int Pattern) {
  4199. if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
  4200. return DAG.getConstant(1, DL, MVT::nxv1i1);
  4201. return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
  4202. DAG.getTargetConstant(Pattern, DL, MVT::i32));
  4203. }
  4204. static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
  4205. bool IsLess, bool IsEqual) {
  4206. if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
  4207. !isa<ConstantSDNode>(Op.getOperand(2)))
  4208. return SDValue();
  4209. SDLoc dl(Op);
  4210. APInt X = Op.getConstantOperandAPInt(1);
  4211. APInt Y = Op.getConstantOperandAPInt(2);
  4212. APInt NumActiveElems;
  4213. bool Overflow;
  4214. if (IsLess)
  4215. NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
  4216. else
  4217. NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
  4218. if (Overflow)
  4219. return SDValue();
  4220. if (IsEqual) {
  4221. APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
  4222. NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
  4223. : NumActiveElems.uadd_ov(One, Overflow);
  4224. if (Overflow)
  4225. return SDValue();
  4226. }
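// With the number of active elements known at compile time, emit a PTRUE with
// the matching VL pattern when that count is guaranteed to fit in the minimum
// SVE register size. For example, whilelo(0, 4) producing nxv4i1 with a
// 128-bit minimum vector length becomes a PTRUE with pattern VL4.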
  4227. std::optional<unsigned> PredPattern =
  4228. getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
  4229. unsigned MinSVEVectorSize = std::max(
  4230. DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
  4231. unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
  4232. if (PredPattern != std::nullopt &&
  4233. NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
  4234. return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
  4235. return SDValue();
  4236. }
  4237. // Returns a safe bitcast between two scalable vector predicates, where
  4238. // any newly created lanes from a widening bitcast are defined as zero.
  4239. static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
  4240. SDLoc DL(Op);
  4241. EVT InVT = Op.getValueType();
  4242. assert(InVT.getVectorElementType() == MVT::i1 &&
  4243. VT.getVectorElementType() == MVT::i1 &&
  4244. "Expected a predicate-to-predicate bitcast");
  4245. assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
  4246. InVT.isScalableVector() &&
  4247. DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
  4248. "Only expect to cast between legal scalable predicate types!");
  4249. // Return the operand if the cast isn't changing type,
  4250. // e.g. <n x 16 x i1> -> <n x 16 x i1>
  4251. if (InVT == VT)
  4252. return Op;
  4253. SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
  4254. // We only have to zero the lanes if new lanes are being defined, e.g. when
  4255. // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
  4256. // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
  4257. // we can return here.
  4258. if (InVT.bitsGT(VT))
  4259. return Reinterpret;
  4260. // Check if the other lanes are already known to be zeroed by
  4261. // construction.
  4262. if (isZeroingInactiveLanes(Op))
  4263. return Reinterpret;
  4264. // Zero the newly introduced lanes.
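// An all-active predicate of the source type, reinterpreted to the wider type,
// has ones only in the lane positions the source defined, so ANDing with it
// clears the newly introduced lanes.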
  4265. SDValue Mask = DAG.getConstant(1, DL, InVT);
  4266. Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
  4267. return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
  4268. }
  4269. SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
  4270. SMEAttrs Attrs, SDLoc DL,
  4271. EVT VT) const {
  4272. if (Attrs.hasStreamingInterfaceOrBody())
  4273. return DAG.getConstant(1, DL, VT);
  4274. if (Attrs.hasNonStreamingInterfaceAndBody())
  4275. return DAG.getConstant(0, DL, VT);
  4276. assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");
  4277. SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
  4278. getPointerTy(DAG.getDataLayout()));
  4279. Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
  4280. Type *RetTy = StructType::get(Int64Ty, Int64Ty);
  4281. TargetLowering::CallLoweringInfo CLI(DAG);
  4282. ArgListTy Args;
  4283. CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
  4284. CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
  4285. RetTy, Callee, std::move(Args));
  4286. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  4287. SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
  4288. return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
  4289. Mask);
  4290. }
  4291. static std::optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
  4292. if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
  4293. StringRef S(ES->getSymbol());
  4294. if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
  4295. return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
  4296. if (S == "__arm_tpidr2_restore")
  4297. return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
  4298. }
  4299. return std::nullopt;
  4300. }
  4301. SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
  4302. SelectionDAG &DAG) const {
  4303. unsigned IntNo = Op.getConstantOperandVal(1);
  4304. SDLoc DL(Op);
  4305. switch (IntNo) {
  4306. default:
  4307. return SDValue(); // Don't custom lower most intrinsics.
  4308. case Intrinsic::aarch64_prefetch: {
  4309. SDValue Chain = Op.getOperand(0);
  4310. SDValue Addr = Op.getOperand(2);
  4311. unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  4312. unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  4313. unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  4314. unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
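// Pack the operands into the 5-bit prfop immediate. For example, a read data
// prefetch into L1 with the keep policy (IsWrite = 0, IsData = 1, Locality = 0,
// IsStream = 0) encodes as 0b00000, i.e. PLDL1KEEP.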
  4315. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
  4316. (!IsData << 3) | // IsDataCache bit
  4317. (Locality << 1) | // Cache level bits
  4318. (unsigned)IsStream; // Stream bit
  4319. return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
  4320. DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
  4321. }
  4322. case Intrinsic::aarch64_sme_za_enable:
  4323. return DAG.getNode(
  4324. AArch64ISD::SMSTART, DL, MVT::Other,
  4325. Op->getOperand(0), // Chain
  4326. DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
  4327. DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  4328. case Intrinsic::aarch64_sme_za_disable:
  4329. return DAG.getNode(
  4330. AArch64ISD::SMSTOP, DL, MVT::Other,
  4331. Op->getOperand(0), // Chain
  4332. DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
  4333. DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  4334. }
  4335. }
  4336. SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
  4337. SelectionDAG &DAG) const {
  4338. unsigned IntNo = Op.getConstantOperandVal(1);
  4339. SDLoc DL(Op);
  4340. switch (IntNo) {
  4341. default:
  4342. return SDValue(); // Don't custom lower most intrinsics.
  4343. case Intrinsic::aarch64_mops_memset_tag: {
  4344. auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
  4345. SDValue Chain = Node->getChain();
  4346. SDValue Dst = Op.getOperand(2);
  4347. SDValue Val = Op.getOperand(3);
  4348. Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
  4349. SDValue Size = Op.getOperand(4);
  4350. auto Alignment = Node->getMemOperand()->getAlign();
  4351. bool IsVol = Node->isVolatile();
  4352. auto DstPtrInfo = Node->getPointerInfo();
  4353. const auto &SDI =
  4354. static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
  4355. SDValue MS =
  4356. SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
  4357. Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
  4358. // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
  4359. // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
  4360. // LowerOperationWrapper will complain that the number of results has
  4361. // changed.
  4362. return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
  4363. }
  4364. }
  4365. }
  4366. SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  4367. SelectionDAG &DAG) const {
  4368. unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  4369. SDLoc dl(Op);
  4370. switch (IntNo) {
  4371. default: return SDValue(); // Don't custom lower most intrinsics.
  4372. case Intrinsic::thread_pointer: {
  4373. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  4374. return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
  4375. }
  4376. case Intrinsic::aarch64_neon_abs: {
  4377. EVT Ty = Op.getValueType();
  4378. if (Ty == MVT::i64) {
  4379. SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
  4380. Op.getOperand(1));
  4381. Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
  4382. return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
  4383. } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
  4384. return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
  4385. } else {
4386. report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
  4387. }
  4388. }
  4389. case Intrinsic::aarch64_neon_pmull64: {
  4390. SDValue LHS = Op.getOperand(1);
  4391. SDValue RHS = Op.getOperand(2);
  4392. std::optional<uint64_t> LHSLane =
  4393. getConstantLaneNumOfExtractHalfOperand(LHS);
  4394. std::optional<uint64_t> RHSLane =
  4395. getConstantLaneNumOfExtractHalfOperand(RHS);
  4396. assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
  4397. assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
4398. // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
4399. // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
4400. // which ISel recognizes better; for example, this generates a ldr into d*
4401. // registers as opposed to a GPR load followed by a fmov.
  4402. auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
  4403. std::optional<uint64_t> OtherLane,
  4404. const SDLoc &dl,
  4405. SelectionDAG &DAG) -> SDValue {
4406. // If the operand is a higher half itself, rewrite it to
4407. // extract_high_v2i64; this way aarch64_neon_pmull64 can
4408. // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
  4409. if (NLane && *NLane == 1)
  4410. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
  4411. N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
  4412. // Operand N is not a higher half but the other operand is.
  4413. if (OtherLane && *OtherLane == 1) {
  4414. // If this operand is a lower half, rewrite it to
  4415. // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
  4416. // align lanes of two operands. A roundtrip sequence (to move from lane
  4417. // 1 to lane 0) is like this:
  4418. // mov x8, v0.d[1]
  4419. // fmov d0, x8
  4420. if (NLane && *NLane == 0)
  4421. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
  4422. DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
  4423. N.getOperand(0),
  4424. DAG.getConstant(0, dl, MVT::i64)),
  4425. DAG.getConstant(1, dl, MVT::i64));
  4426. // Otherwise just dup from main to all lanes.
  4427. return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
  4428. }
  4429. // Neither operand is an extract of higher half, so codegen may just use
  4430. // the non-high version of PMULL instruction. Use v1i64 to represent i64.
  4431. assert(N.getValueType() == MVT::i64 &&
  4432. "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
  4433. return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
  4434. };
  4435. LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
  4436. RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
  4437. return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
  4438. }
  4439. case Intrinsic::aarch64_neon_smax:
  4440. return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
  4441. Op.getOperand(1), Op.getOperand(2));
  4442. case Intrinsic::aarch64_neon_umax:
  4443. return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
  4444. Op.getOperand(1), Op.getOperand(2));
  4445. case Intrinsic::aarch64_neon_smin:
  4446. return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
  4447. Op.getOperand(1), Op.getOperand(2));
  4448. case Intrinsic::aarch64_neon_umin:
  4449. return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
  4450. Op.getOperand(1), Op.getOperand(2));
  4451. case Intrinsic::aarch64_neon_scalar_sqxtn:
  4452. case Intrinsic::aarch64_neon_scalar_sqxtun:
  4453. case Intrinsic::aarch64_neon_scalar_uqxtn: {
  4454. assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
  4455. if (Op.getValueType() == MVT::i32)
  4456. return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
  4457. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
  4458. Op.getOperand(0),
  4459. DAG.getNode(ISD::BITCAST, dl, MVT::f64,
  4460. Op.getOperand(1))));
  4461. return SDValue();
  4462. }
  4463. case Intrinsic::aarch64_sve_whilelo:
  4464. return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
  4465. /*IsEqual=*/false);
  4466. case Intrinsic::aarch64_sve_whilelt:
  4467. return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
  4468. /*IsEqual=*/false);
  4469. case Intrinsic::aarch64_sve_whilels:
  4470. return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
  4471. /*IsEqual=*/true);
  4472. case Intrinsic::aarch64_sve_whilele:
  4473. return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
  4474. /*IsEqual=*/true);
  4475. case Intrinsic::aarch64_sve_whilege:
  4476. return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
  4477. /*IsEqual=*/true);
  4478. case Intrinsic::aarch64_sve_whilegt:
  4479. return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
  4480. /*IsEqual=*/false);
  4481. case Intrinsic::aarch64_sve_whilehs:
  4482. return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
  4483. /*IsEqual=*/true);
  4484. case Intrinsic::aarch64_sve_whilehi:
  4485. return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
  4486. /*IsEqual=*/false);
  4487. case Intrinsic::aarch64_sve_sunpkhi:
  4488. return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
  4489. Op.getOperand(1));
  4490. case Intrinsic::aarch64_sve_sunpklo:
  4491. return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
  4492. Op.getOperand(1));
  4493. case Intrinsic::aarch64_sve_uunpkhi:
  4494. return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
  4495. Op.getOperand(1));
  4496. case Intrinsic::aarch64_sve_uunpklo:
  4497. return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
  4498. Op.getOperand(1));
  4499. case Intrinsic::aarch64_sve_clasta_n:
  4500. return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
  4501. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  4502. case Intrinsic::aarch64_sve_clastb_n:
  4503. return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
  4504. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  4505. case Intrinsic::aarch64_sve_lasta:
  4506. return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
  4507. Op.getOperand(1), Op.getOperand(2));
  4508. case Intrinsic::aarch64_sve_lastb:
  4509. return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
  4510. Op.getOperand(1), Op.getOperand(2));
  4511. case Intrinsic::aarch64_sve_rev:
  4512. return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
  4513. Op.getOperand(1));
  4514. case Intrinsic::aarch64_sve_tbl:
  4515. return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
  4516. Op.getOperand(1), Op.getOperand(2));
  4517. case Intrinsic::aarch64_sve_trn1:
  4518. return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
  4519. Op.getOperand(1), Op.getOperand(2));
  4520. case Intrinsic::aarch64_sve_trn2:
  4521. return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
  4522. Op.getOperand(1), Op.getOperand(2));
  4523. case Intrinsic::aarch64_sve_uzp1:
  4524. return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
  4525. Op.getOperand(1), Op.getOperand(2));
  4526. case Intrinsic::aarch64_sve_uzp2:
  4527. return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
  4528. Op.getOperand(1), Op.getOperand(2));
  4529. case Intrinsic::aarch64_sve_zip1:
  4530. return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
  4531. Op.getOperand(1), Op.getOperand(2));
  4532. case Intrinsic::aarch64_sve_zip2:
  4533. return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
  4534. Op.getOperand(1), Op.getOperand(2));
  4535. case Intrinsic::aarch64_sve_splice:
  4536. return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
  4537. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  4538. case Intrinsic::aarch64_sve_ptrue:
  4539. return getPTrue(DAG, dl, Op.getValueType(),
  4540. cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  4541. case Intrinsic::aarch64_sve_clz:
  4542. return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
  4543. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
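// The SME element-count intrinsics below are based on RDSVL #1, which returns
// the streaming vector length in bytes; cntsh/cntsw/cntsd shift that right by
// 1, 2 and 3 to count halfwords, words and doublewords.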
  4544. case Intrinsic::aarch64_sme_cntsb:
  4545. return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
  4546. DAG.getConstant(1, dl, MVT::i32));
  4547. case Intrinsic::aarch64_sme_cntsh: {
  4548. SDValue One = DAG.getConstant(1, dl, MVT::i32);
  4549. SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
  4550. return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
  4551. }
  4552. case Intrinsic::aarch64_sme_cntsw: {
  4553. SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
  4554. DAG.getConstant(1, dl, MVT::i32));
  4555. return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
  4556. DAG.getConstant(2, dl, MVT::i32));
  4557. }
  4558. case Intrinsic::aarch64_sme_cntsd: {
  4559. SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
  4560. DAG.getConstant(1, dl, MVT::i32));
  4561. return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
  4562. DAG.getConstant(3, dl, MVT::i32));
  4563. }
  4564. case Intrinsic::aarch64_sve_cnt: {
  4565. SDValue Data = Op.getOperand(3);
  4566. // CTPOP only supports integer operands.
  4567. if (Data.getValueType().isFloatingPoint())
  4568. Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
  4569. return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
  4570. Op.getOperand(2), Data, Op.getOperand(1));
  4571. }
  4572. case Intrinsic::aarch64_sve_dupq_lane:
  4573. return LowerDUPQLane(Op, DAG);
  4574. case Intrinsic::aarch64_sve_convert_from_svbool:
  4575. return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  4576. case Intrinsic::aarch64_sve_convert_to_svbool:
  4577. return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  4578. case Intrinsic::aarch64_sve_fneg:
  4579. return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4580. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4581. case Intrinsic::aarch64_sve_frintp:
  4582. return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
  4583. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4584. case Intrinsic::aarch64_sve_frintm:
  4585. return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
  4586. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4587. case Intrinsic::aarch64_sve_frinti:
  4588. return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
  4589. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4590. case Intrinsic::aarch64_sve_frintx:
  4591. return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
  4592. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4593. case Intrinsic::aarch64_sve_frinta:
  4594. return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
  4595. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4596. case Intrinsic::aarch64_sve_frintn:
  4597. return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
  4598. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4599. case Intrinsic::aarch64_sve_frintz:
  4600. return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
  4601. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4602. case Intrinsic::aarch64_sve_ucvtf:
  4603. return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
  4604. Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
  4605. Op.getOperand(1));
  4606. case Intrinsic::aarch64_sve_scvtf:
  4607. return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
  4608. Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
  4609. Op.getOperand(1));
  4610. case Intrinsic::aarch64_sve_fcvtzu:
  4611. return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
  4612. Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
  4613. Op.getOperand(1));
  4614. case Intrinsic::aarch64_sve_fcvtzs:
  4615. return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
  4616. Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
  4617. Op.getOperand(1));
  4618. case Intrinsic::aarch64_sve_fsqrt:
  4619. return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
  4620. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4621. case Intrinsic::aarch64_sve_frecpx:
  4622. return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
  4623. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4624. case Intrinsic::aarch64_sve_frecpe_x:
  4625. return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
  4626. Op.getOperand(1));
  4627. case Intrinsic::aarch64_sve_frecps_x:
  4628. return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
  4629. Op.getOperand(1), Op.getOperand(2));
  4630. case Intrinsic::aarch64_sve_frsqrte_x:
  4631. return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
  4632. Op.getOperand(1));
  4633. case Intrinsic::aarch64_sve_frsqrts_x:
  4634. return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
  4635. Op.getOperand(1), Op.getOperand(2));
  4636. case Intrinsic::aarch64_sve_fabs:
  4637. return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
  4638. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4639. case Intrinsic::aarch64_sve_abs:
  4640. return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
  4641. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4642. case Intrinsic::aarch64_sve_neg:
  4643. return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4644. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4645. case Intrinsic::aarch64_sve_insr: {
  4646. SDValue Scalar = Op.getOperand(2);
  4647. EVT ScalarTy = Scalar.getValueType();
  4648. if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
  4649. Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
  4650. return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
  4651. Op.getOperand(1), Scalar);
  4652. }
  4653. case Intrinsic::aarch64_sve_rbit:
  4654. return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
  4655. Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
  4656. Op.getOperand(1));
  4657. case Intrinsic::aarch64_sve_revb:
  4658. return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
  4659. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4660. case Intrinsic::aarch64_sve_revh:
  4661. return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
  4662. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4663. case Intrinsic::aarch64_sve_revw:
  4664. return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
  4665. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4666. case Intrinsic::aarch64_sve_revd:
  4667. return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
  4668. Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  4669. case Intrinsic::aarch64_sve_sxtb:
  4670. return DAG.getNode(
  4671. AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4672. Op.getOperand(2), Op.getOperand(3),
  4673. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
  4674. Op.getOperand(1));
  4675. case Intrinsic::aarch64_sve_sxth:
  4676. return DAG.getNode(
  4677. AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4678. Op.getOperand(2), Op.getOperand(3),
  4679. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
  4680. Op.getOperand(1));
  4681. case Intrinsic::aarch64_sve_sxtw:
  4682. return DAG.getNode(
  4683. AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4684. Op.getOperand(2), Op.getOperand(3),
  4685. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
  4686. Op.getOperand(1));
  4687. case Intrinsic::aarch64_sve_uxtb:
  4688. return DAG.getNode(
  4689. AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4690. Op.getOperand(2), Op.getOperand(3),
  4691. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
  4692. Op.getOperand(1));
  4693. case Intrinsic::aarch64_sve_uxth:
  4694. return DAG.getNode(
  4695. AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4696. Op.getOperand(2), Op.getOperand(3),
  4697. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
  4698. Op.getOperand(1));
  4699. case Intrinsic::aarch64_sve_uxtw:
  4700. return DAG.getNode(
  4701. AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
  4702. Op.getOperand(2), Op.getOperand(3),
  4703. DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
  4704. Op.getOperand(1));
  4705. case Intrinsic::localaddress: {
  4706. const auto &MF = DAG.getMachineFunction();
  4707. const auto *RegInfo = Subtarget->getRegisterInfo();
  4708. unsigned Reg = RegInfo->getLocalAddressRegister(MF);
  4709. return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
  4710. Op.getSimpleValueType());
  4711. }
  4712. case Intrinsic::eh_recoverfp: {
4713. // FIXME: This needs to be implemented to correctly handle highly aligned
4714. // stack objects. For now we simply return the incoming FP. Refer to D53541
4715. // for more details.
  4716. SDValue FnOp = Op.getOperand(1);
  4717. SDValue IncomingFPOp = Op.getOperand(2);
  4718. GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
  4719. auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
  4720. if (!Fn)
  4721. report_fatal_error(
  4722. "llvm.eh.recoverfp must take a function as the first argument");
  4723. return IncomingFPOp;
  4724. }
  4725. case Intrinsic::aarch64_neon_vsri:
  4726. case Intrinsic::aarch64_neon_vsli: {
  4727. EVT Ty = Op.getValueType();
  4728. if (!Ty.isVector())
  4729. report_fatal_error("Unexpected type for aarch64_neon_vsli");
  4730. assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
  4731. bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
  4732. unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
  4733. return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
  4734. Op.getOperand(3));
  4735. }
  4736. case Intrinsic::aarch64_neon_srhadd:
  4737. case Intrinsic::aarch64_neon_urhadd:
  4738. case Intrinsic::aarch64_neon_shadd:
  4739. case Intrinsic::aarch64_neon_uhadd: {
  4740. bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
  4741. IntNo == Intrinsic::aarch64_neon_shadd);
  4742. bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
  4743. IntNo == Intrinsic::aarch64_neon_urhadd);
  4744. unsigned Opcode = IsSignedAdd
  4745. ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
  4746. : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
  4747. return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
  4748. Op.getOperand(2));
  4749. }
  4750. case Intrinsic::aarch64_neon_sabd:
  4751. case Intrinsic::aarch64_neon_uabd: {
  4752. unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
  4753. : ISD::ABDS;
  4754. return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
  4755. Op.getOperand(2));
  4756. }
  4757. case Intrinsic::aarch64_neon_saddlp:
  4758. case Intrinsic::aarch64_neon_uaddlp: {
  4759. unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
  4760. ? AArch64ISD::UADDLP
  4761. : AArch64ISD::SADDLP;
  4762. return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
  4763. }
  4764. case Intrinsic::aarch64_neon_sdot:
  4765. case Intrinsic::aarch64_neon_udot:
  4766. case Intrinsic::aarch64_sve_sdot:
  4767. case Intrinsic::aarch64_sve_udot: {
  4768. unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
  4769. IntNo == Intrinsic::aarch64_sve_udot)
  4770. ? AArch64ISD::UDOT
  4771. : AArch64ISD::SDOT;
  4772. return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
  4773. Op.getOperand(2), Op.getOperand(3));
  4774. }
  4775. case Intrinsic::get_active_lane_mask: {
  4776. SDValue ID =
  4777. DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
  4778. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
  4779. Op.getOperand(1), Op.getOperand(2));
  4780. }
  4781. }
  4782. }
  4783. bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
  4784. if (VT.getVectorElementType() == MVT::i8 ||
  4785. VT.getVectorElementType() == MVT::i16) {
  4786. EltTy = MVT::i32;
  4787. return true;
  4788. }
  4789. return false;
  4790. }
  4791. bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
  4792. EVT DataVT) const {
  4793. // SVE only supports implicit extension of 32-bit indices.
  4794. if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
  4795. return false;
  4796. // Indices cannot be smaller than the main data type.
  4797. if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
  4798. return false;
  4799. // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
  4800. // element container type, which would violate the previous clause.
  4801. return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
  4802. }
  4803. bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  4804. return ExtVal.getValueType().isScalableVector() ||
  4805. useSVEForFixedLengthVectorVT(
  4806. ExtVal.getValueType(),
  4807. /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
  4808. }
  4809. unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  4810. std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
  4811. {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
  4812. AArch64ISD::GLD1_MERGE_ZERO},
  4813. {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
  4814. AArch64ISD::GLD1_UXTW_MERGE_ZERO},
  4815. {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
  4816. AArch64ISD::GLD1_MERGE_ZERO},
  4817. {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
  4818. AArch64ISD::GLD1_SXTW_MERGE_ZERO},
  4819. {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
  4820. AArch64ISD::GLD1_SCALED_MERGE_ZERO},
  4821. {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
  4822. AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
  4823. {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
  4824. AArch64ISD::GLD1_SCALED_MERGE_ZERO},
  4825. {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
  4826. AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
  4827. };
  4828. auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
  4829. return AddrModes.find(Key)->second;
  4830. }
  4831. unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
  4832. switch (Opcode) {
  4833. default:
  4834. llvm_unreachable("unimplemented opcode");
  4835. return Opcode;
  4836. case AArch64ISD::GLD1_MERGE_ZERO:
  4837. return AArch64ISD::GLD1S_MERGE_ZERO;
  4838. case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  4839. return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  4840. case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  4841. return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  4842. case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  4843. return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
  4844. case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  4845. return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  4846. case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  4847. return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
  4848. case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  4849. return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
  4850. }
  4851. }
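// Lower ISD::MGATHER for SVE (and SVE-backed fixed-length vectors). The
// lowering below handles, in order:
//   1. Non-zero/non-undef passthrough values, which SVE cannot encode
//      directly, by gathering into an undef vector and selecting against the
//      mask.
//   2. Index scales other than sizeof(MemVT element), by pre-shifting the
//      index vector and re-emitting the gather with a scale of 1.
//   3. Fixed-length gathers, by promoting the operands, emitting an
//      equivalent scalable-vector gather, and extracting the fixed result.
// Anything else is already legal and is returned unchanged.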
  4852. SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
  4853. SelectionDAG &DAG) const {
  4854. MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
  4855. SDLoc DL(Op);
  4856. SDValue Chain = MGT->getChain();
  4857. SDValue PassThru = MGT->getPassThru();
  4858. SDValue Mask = MGT->getMask();
  4859. SDValue BasePtr = MGT->getBasePtr();
  4860. SDValue Index = MGT->getIndex();
  4861. SDValue Scale = MGT->getScale();
  4862. EVT VT = Op.getValueType();
  4863. EVT MemVT = MGT->getMemoryVT();
  4864. ISD::LoadExtType ExtType = MGT->getExtensionType();
  4865. ISD::MemIndexType IndexType = MGT->getIndexType();
// SVE supports only zero (and hence undef) passthrough values; everything
// else must be handled manually by an explicit select on the load's output.
  4868. if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
  4869. SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
  4870. SDValue Load =
  4871. DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
  4872. MGT->getMemOperand(), IndexType, ExtType);
  4873. SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
  4874. return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  4875. }
  4876. bool IsScaled = MGT->isIndexScaled();
  4877. bool IsSigned = MGT->isIndexSigned();
// SVE supports only an index scaled by sizeof(MemVT.elt); everything else
// must be calculated beforehand.
  4880. uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
  4881. if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
  4882. assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
  4883. EVT IndexVT = Index.getValueType();
  4884. Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
  4885. DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
  4886. Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
  4887. SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
  4888. return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
  4889. MGT->getMemOperand(), IndexType, ExtType);
  4890. }
  4891. // Lower fixed length gather to a scalable equivalent.
  4892. if (VT.isFixedLengthVector()) {
  4893. assert(Subtarget->useSVEForFixedLengthVectors() &&
  4894. "Cannot lower when not using SVE for fixed vectors!");
  4895. // NOTE: Handle floating-point as if integer then bitcast the result.
  4896. EVT DataVT = VT.changeVectorElementTypeToInteger();
  4897. MemVT = MemVT.changeVectorElementTypeToInteger();
  4898. // Find the smallest integer fixed length vector we can use for the gather.
  4899. EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
  4900. if (DataVT.getVectorElementType() == MVT::i64 ||
  4901. Index.getValueType().getVectorElementType() == MVT::i64 ||
  4902. Mask.getValueType().getVectorElementType() == MVT::i64)
  4903. PromotedVT = VT.changeVectorElementType(MVT::i64);
  4904. // Promote vector operands except for passthrough, which we know is either
  4905. // undef or zero, and thus best constructed directly.
  4906. unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  4907. Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
  4908. Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
  4909. // A promoted result type forces the need for an extending load.
  4910. if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
  4911. ExtType = ISD::EXTLOAD;
  4912. EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
  4913. // Convert fixed length vector operands to scalable.
  4914. MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
  4915. Index = convertToScalableVector(DAG, ContainerVT, Index);
  4916. Mask = convertFixedMaskToScalableVector(Mask, DAG);
  4917. PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
  4918. : DAG.getConstant(0, DL, ContainerVT);
  4919. // Emit equivalent scalable vector gather.
  4920. SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
  4921. SDValue Load =
  4922. DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
  4923. Ops, MGT->getMemOperand(), IndexType, ExtType);
  4924. // Extract fixed length data then convert to the required result type.
  4925. SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
  4926. Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
  4927. if (VT.isFloatingPoint())
  4928. Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
  4929. return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
  4930. }
  4931. // Everything else is legal.
  4932. return Op;
  4933. }
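// Lower ISD::MSCATTER following the same structure as LowerMGATHER above:
// pre-scale non-trivial index scales, then rewrite fixed-length scatters as
// scalable ones, promoting the index, mask and stored value as needed.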
  4934. SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
  4935. SelectionDAG &DAG) const {
  4936. MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
  4937. SDLoc DL(Op);
  4938. SDValue Chain = MSC->getChain();
  4939. SDValue StoreVal = MSC->getValue();
  4940. SDValue Mask = MSC->getMask();
  4941. SDValue BasePtr = MSC->getBasePtr();
  4942. SDValue Index = MSC->getIndex();
  4943. SDValue Scale = MSC->getScale();
  4944. EVT VT = StoreVal.getValueType();
  4945. EVT MemVT = MSC->getMemoryVT();
  4946. ISD::MemIndexType IndexType = MSC->getIndexType();
  4947. bool Truncating = MSC->isTruncatingStore();
  4948. bool IsScaled = MSC->isIndexScaled();
  4949. bool IsSigned = MSC->isIndexSigned();
// SVE supports only an index scaled by sizeof(MemVT.elt); everything else
// must be calculated beforehand.
  4952. uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
  4953. if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
  4954. assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
  4955. EVT IndexVT = Index.getValueType();
  4956. Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
  4957. DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
  4958. Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
  4959. SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
  4960. return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
  4961. MSC->getMemOperand(), IndexType, Truncating);
  4962. }
  4963. // Lower fixed length scatter to a scalable equivalent.
  4964. if (VT.isFixedLengthVector()) {
  4965. assert(Subtarget->useSVEForFixedLengthVectors() &&
  4966. "Cannot lower when not using SVE for fixed vectors!");
  4967. // Once bitcast we treat floating-point scatters as if integer.
  4968. if (VT.isFloatingPoint()) {
  4969. VT = VT.changeVectorElementTypeToInteger();
  4970. MemVT = MemVT.changeVectorElementTypeToInteger();
  4971. StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
  4972. }
  4973. // Find the smallest integer fixed length vector we can use for the scatter.
  4974. EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
  4975. if (VT.getVectorElementType() == MVT::i64 ||
  4976. Index.getValueType().getVectorElementType() == MVT::i64 ||
  4977. Mask.getValueType().getVectorElementType() == MVT::i64)
  4978. PromotedVT = VT.changeVectorElementType(MVT::i64);
  4979. // Promote vector operands.
  4980. unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  4981. Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
  4982. Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
  4983. StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
  4984. // A promoted value type forces the need for a truncating store.
  4985. if (PromotedVT != VT)
  4986. Truncating = true;
  4987. EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
  4988. // Convert fixed length vector operands to scalable.
  4989. MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
  4990. Index = convertToScalableVector(DAG, ContainerVT, Index);
  4991. Mask = convertFixedMaskToScalableVector(Mask, DAG);
  4992. StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
  4993. // Emit equivalent scalable vector scatter.
  4994. SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
  4995. return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
  4996. MSC->getMemOperand(), IndexType, Truncating);
  4997. }
  4998. // Everything else is legal.
  4999. return Op;
  5000. }
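// Lower ISD::MLOAD. Fixed-length vectors that must use SVE are handled by
// LowerFixedLengthVectorMLoadToSVE; otherwise a masked load with a
// non-zero/non-undef passthrough is rewritten as a load with an undef
// passthrough followed by an explicit select against the mask.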
  5001. SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  5002. SDLoc DL(Op);
  5003. MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
  5004. assert(LoadNode && "Expected custom lowering of a masked load node");
  5005. EVT VT = Op->getValueType(0);
  5006. if (useSVEForFixedLengthVectorVT(
  5007. VT,
  5008. /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
  5009. return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
  5010. SDValue PassThru = LoadNode->getPassThru();
  5011. SDValue Mask = LoadNode->getMask();
  5012. if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
  5013. return Op;
  5014. SDValue Load = DAG.getMaskedLoad(
  5015. VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
  5016. LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
  5017. LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
  5018. LoadNode->getExtensionType());
  5019. SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
  5020. return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
  5021. }
  5022. // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
  5023. static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
  5024. EVT VT, EVT MemVT,
  5025. SelectionDAG &DAG) {
  5026. assert(VT.isVector() && "VT should be a vector type");
  5027. assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
  5028. SDValue Value = ST->getValue();
// First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
// the word lane that represents the v4i8 subvector. This optimizes the store
// to:
  5032. //
  5033. // xtn v0.8b, v0.8h
  5034. // str s0, [x0]
  5035. SDValue Undef = DAG.getUNDEF(MVT::i16);
  5036. SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
  5037. {Undef, Undef, Undef, Undef});
  5038. SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
  5039. Value, UndefVec);
  5040. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
  5041. Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
  5042. SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
  5043. Trunc, DAG.getConstant(0, DL, MVT::i64));
  5044. return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
  5045. ST->getBasePtr(), ST->getMemOperand());
  5046. }
// Custom lowering for any store, vector or scalar, normal or truncating.
// Currently we only custom lower truncating stores from v4i16 to v4i8 and
// volatile stores of i128.
  5050. SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
  5051. SelectionDAG &DAG) const {
  5052. SDLoc Dl(Op);
  5053. StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
assert(StoreNode && "Can only custom lower store nodes");
  5055. SDValue Value = StoreNode->getValue();
  5056. EVT VT = Value.getValueType();
  5057. EVT MemVT = StoreNode->getMemoryVT();
  5058. if (VT.isVector()) {
  5059. if (useSVEForFixedLengthVectorVT(
  5060. VT,
  5061. /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
  5062. return LowerFixedLengthVectorStoreToSVE(Op, DAG);
  5063. unsigned AS = StoreNode->getAddressSpace();
  5064. Align Alignment = StoreNode->getAlign();
  5065. if (Alignment < MemVT.getStoreSize() &&
  5066. !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
  5067. StoreNode->getMemOperand()->getFlags(),
  5068. nullptr)) {
  5069. return scalarizeVectorStore(StoreNode, DAG);
  5070. }
  5071. if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
  5072. MemVT == MVT::v4i8) {
  5073. return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
  5074. }
// 256-bit non-temporal stores can be lowered to STNP. Do this as part of
// the custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256-bit inputs.
  5078. ElementCount EC = MemVT.getVectorElementCount();
  5079. if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
  5080. EC.isKnownEven() &&
  5081. ((MemVT.getScalarSizeInBits() == 8u ||
  5082. MemVT.getScalarSizeInBits() == 16u ||
  5083. MemVT.getScalarSizeInBits() == 32u ||
  5084. MemVT.getScalarSizeInBits() == 64u))) {
  5085. SDValue Lo =
  5086. DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
  5087. MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
  5088. StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
  5089. SDValue Hi =
  5090. DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
  5091. MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
  5092. StoreNode->getValue(),
  5093. DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
  5094. SDValue Result = DAG.getMemIntrinsicNode(
  5095. AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
  5096. {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
  5097. StoreNode->getMemoryVT(), StoreNode->getMemOperand());
  5098. return Result;
  5099. }
  5100. } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
  5101. return LowerStore128(Op, DAG);
  5102. } else if (MemVT == MVT::i64x8) {
  5103. SDValue Value = StoreNode->getValue();
  5104. assert(Value->getValueType(0) == MVT::i64x8);
  5105. SDValue Chain = StoreNode->getChain();
  5106. SDValue Base = StoreNode->getBasePtr();
  5107. EVT PtrVT = Base.getValueType();
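// An i64x8 (LS64) store is decomposed into eight i64 stores of the extracted
// 64-bit parts at consecutive 8-byte offsets from the base pointer.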
  5108. for (unsigned i = 0; i < 8; i++) {
  5109. SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
  5110. Value, DAG.getConstant(i, Dl, MVT::i32));
  5111. SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
  5112. DAG.getConstant(i * 8, Dl, PtrVT));
  5113. Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
  5114. StoreNode->getOriginalAlign());
  5115. }
  5116. return Chain;
  5117. }
  5118. return SDValue();
  5119. }
  5120. /// Lower atomic or volatile 128-bit stores to a single STP instruction.
  5121. SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
  5122. SelectionDAG &DAG) const {
  5123. MemSDNode *StoreNode = cast<MemSDNode>(Op);
  5124. assert(StoreNode->getMemoryVT() == MVT::i128);
  5125. assert(StoreNode->isVolatile() || StoreNode->isAtomic());
  5126. assert(!StoreNode->isAtomic() ||
  5127. StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
  5128. StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
  5129. SDValue Value = StoreNode->getOpcode() == ISD::STORE
  5130. ? StoreNode->getOperand(1)
  5131. : StoreNode->getOperand(2);
  5132. SDLoc DL(Op);
  5133. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
  5134. DAG.getConstant(0, DL, MVT::i64));
  5135. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
  5136. DAG.getConstant(1, DL, MVT::i64));
  5137. SDValue Result = DAG.getMemIntrinsicNode(
  5138. AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
  5139. {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
  5140. StoreNode->getMemoryVT(), StoreNode->getMemOperand());
  5141. return Result;
  5142. }
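// Custom lowering for loads: i64x8 (LS64) loads are split into eight i64
// loads and rebuilt with LS64_BUILD, while extending loads of v4i8 are
// widened via an f32 load, a bitcast to v8i8 and a vector extend.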
  5143. SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
  5144. SelectionDAG &DAG) const {
  5145. SDLoc DL(Op);
  5146. LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  5147. assert(LoadNode && "Expected custom lowering of a load node");
  5148. if (LoadNode->getMemoryVT() == MVT::i64x8) {
  5149. SmallVector<SDValue, 8> Ops;
  5150. SDValue Base = LoadNode->getBasePtr();
  5151. SDValue Chain = LoadNode->getChain();
  5152. EVT PtrVT = Base.getValueType();
  5153. for (unsigned i = 0; i < 8; i++) {
  5154. SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
  5155. DAG.getConstant(i * 8, DL, PtrVT));
  5156. SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
  5157. LoadNode->getPointerInfo(),
  5158. LoadNode->getOriginalAlign());
  5159. Ops.push_back(Part);
  5160. Chain = SDValue(Part.getNode(), 1);
  5161. }
  5162. SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
  5163. return DAG.getMergeValues({Loaded, Chain}, DL);
  5164. }
  5165. // Custom lowering for extending v4i8 vector loads.
  5166. EVT VT = Op->getValueType(0);
  5167. assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
  5168. if (LoadNode->getMemoryVT() != MVT::v4i8)
  5169. return SDValue();
  5170. unsigned ExtType;
  5171. if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
  5172. ExtType = ISD::SIGN_EXTEND;
  5173. else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
  5174. LoadNode->getExtensionType() == ISD::EXTLOAD)
  5175. ExtType = ISD::ZERO_EXTEND;
  5176. else
  5177. return SDValue();
  5178. SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
  5179. LoadNode->getBasePtr(), MachinePointerInfo());
  5180. SDValue Chain = Load.getValue(1);
  5181. SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
  5182. SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
  5183. SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
  5184. Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
  5185. DAG.getConstant(0, DL, MVT::i64));
  5186. if (VT == MVT::v4i32)
  5187. Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
  5188. return DAG.getMergeValues({Ext, Chain}, DL);
  5189. }
  5190. // Generate SUBS and CSEL for integer abs.
  5191. SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
  5192. MVT VT = Op.getSimpleValueType();
  5193. if (VT.isVector())
  5194. return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
  5195. SDLoc DL(Op);
  5196. SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
  5197. Op.getOperand(0));
  5198. // Generate SUBS & CSEL.
  5199. SDValue Cmp =
  5200. DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
  5201. Op.getOperand(0), DAG.getConstant(0, DL, VT));
  5202. return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
  5203. DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
  5204. Cmp.getValue(1));
  5205. }
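// Lower ISD::BRCOND only when the condition can be emitted as a conjunction
// (a CMP/CCMP chain); otherwise return an empty SDValue so the generic
// expansion is used.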
  5206. static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  5207. SDValue Chain = Op.getOperand(0);
  5208. SDValue Cond = Op.getOperand(1);
  5209. SDValue Dest = Op.getOperand(2);
  5210. AArch64CC::CondCode CC;
  5211. if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
  5212. SDLoc dl(Op);
  5213. SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
  5214. return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
  5215. Cmp);
  5216. }
  5217. return SDValue();
  5218. }
  5219. SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
  5220. SelectionDAG &DAG) const {
  5221. LLVM_DEBUG(dbgs() << "Custom lowering: ");
  5222. LLVM_DEBUG(Op.dump());
  5223. switch (Op.getOpcode()) {
  5224. default:
  5225. llvm_unreachable("unimplemented operand");
  5226. return SDValue();
  5227. case ISD::BITCAST:
  5228. return LowerBITCAST(Op, DAG);
  5229. case ISD::GlobalAddress:
  5230. return LowerGlobalAddress(Op, DAG);
  5231. case ISD::GlobalTLSAddress:
  5232. return LowerGlobalTLSAddress(Op, DAG);
  5233. case ISD::SETCC:
  5234. case ISD::STRICT_FSETCC:
  5235. case ISD::STRICT_FSETCCS:
  5236. return LowerSETCC(Op, DAG);
  5237. case ISD::SETCCCARRY:
  5238. return LowerSETCCCARRY(Op, DAG);
  5239. case ISD::BRCOND:
  5240. return LowerBRCOND(Op, DAG);
  5241. case ISD::BR_CC:
  5242. return LowerBR_CC(Op, DAG);
  5243. case ISD::SELECT:
  5244. return LowerSELECT(Op, DAG);
  5245. case ISD::SELECT_CC:
  5246. return LowerSELECT_CC(Op, DAG);
  5247. case ISD::JumpTable:
  5248. return LowerJumpTable(Op, DAG);
  5249. case ISD::BR_JT:
  5250. return LowerBR_JT(Op, DAG);
  5251. case ISD::ConstantPool:
  5252. return LowerConstantPool(Op, DAG);
  5253. case ISD::BlockAddress:
  5254. return LowerBlockAddress(Op, DAG);
  5255. case ISD::VASTART:
  5256. return LowerVASTART(Op, DAG);
  5257. case ISD::VACOPY:
  5258. return LowerVACOPY(Op, DAG);
  5259. case ISD::VAARG:
  5260. return LowerVAARG(Op, DAG);
  5261. case ISD::ADDCARRY:
  5262. return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
  5263. case ISD::SUBCARRY:
  5264. return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
  5265. case ISD::SADDO_CARRY:
  5266. return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
  5267. case ISD::SSUBO_CARRY:
  5268. return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
  5269. case ISD::SADDO:
  5270. case ISD::UADDO:
  5271. case ISD::SSUBO:
  5272. case ISD::USUBO:
  5273. case ISD::SMULO:
  5274. case ISD::UMULO:
  5275. return LowerXALUO(Op, DAG);
  5276. case ISD::FADD:
  5277. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
  5278. case ISD::FSUB:
  5279. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
  5280. case ISD::FMUL:
  5281. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
  5282. case ISD::FMA:
  5283. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
  5284. case ISD::FDIV:
  5285. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
  5286. case ISD::FNEG:
  5287. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
  5288. case ISD::FCEIL:
  5289. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
  5290. case ISD::FFLOOR:
  5291. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
  5292. case ISD::FNEARBYINT:
  5293. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
  5294. case ISD::FRINT:
  5295. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
  5296. case ISD::FROUND:
  5297. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
  5298. case ISD::FROUNDEVEN:
  5299. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
  5300. case ISD::FTRUNC:
  5301. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
  5302. case ISD::FSQRT:
  5303. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
  5304. case ISD::FABS:
  5305. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
  5306. case ISD::FP_ROUND:
  5307. case ISD::STRICT_FP_ROUND:
  5308. return LowerFP_ROUND(Op, DAG);
  5309. case ISD::FP_EXTEND:
  5310. return LowerFP_EXTEND(Op, DAG);
  5311. case ISD::FRAMEADDR:
  5312. return LowerFRAMEADDR(Op, DAG);
  5313. case ISD::SPONENTRY:
  5314. return LowerSPONENTRY(Op, DAG);
  5315. case ISD::RETURNADDR:
  5316. return LowerRETURNADDR(Op, DAG);
  5317. case ISD::ADDROFRETURNADDR:
  5318. return LowerADDROFRETURNADDR(Op, DAG);
  5319. case ISD::CONCAT_VECTORS:
  5320. return LowerCONCAT_VECTORS(Op, DAG);
  5321. case ISD::INSERT_VECTOR_ELT:
  5322. return LowerINSERT_VECTOR_ELT(Op, DAG);
  5323. case ISD::EXTRACT_VECTOR_ELT:
  5324. return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  5325. case ISD::BUILD_VECTOR:
  5326. return LowerBUILD_VECTOR(Op, DAG);
  5327. case ISD::ZERO_EXTEND_VECTOR_INREG:
  5328. return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
  5329. case ISD::VECTOR_SHUFFLE:
  5330. return LowerVECTOR_SHUFFLE(Op, DAG);
  5331. case ISD::SPLAT_VECTOR:
  5332. return LowerSPLAT_VECTOR(Op, DAG);
  5333. case ISD::EXTRACT_SUBVECTOR:
  5334. return LowerEXTRACT_SUBVECTOR(Op, DAG);
  5335. case ISD::INSERT_SUBVECTOR:
  5336. return LowerINSERT_SUBVECTOR(Op, DAG);
  5337. case ISD::SDIV:
  5338. case ISD::UDIV:
  5339. return LowerDIV(Op, DAG);
  5340. case ISD::SMIN:
  5341. case ISD::UMIN:
  5342. case ISD::SMAX:
  5343. case ISD::UMAX:
  5344. return LowerMinMax(Op, DAG);
  5345. case ISD::SRA:
  5346. case ISD::SRL:
  5347. case ISD::SHL:
  5348. return LowerVectorSRA_SRL_SHL(Op, DAG);
  5349. case ISD::SHL_PARTS:
  5350. case ISD::SRL_PARTS:
  5351. case ISD::SRA_PARTS:
  5352. return LowerShiftParts(Op, DAG);
  5353. case ISD::CTPOP:
  5354. case ISD::PARITY:
  5355. return LowerCTPOP_PARITY(Op, DAG);
  5356. case ISD::FCOPYSIGN:
  5357. return LowerFCOPYSIGN(Op, DAG);
  5358. case ISD::OR:
  5359. return LowerVectorOR(Op, DAG);
  5360. case ISD::XOR:
  5361. return LowerXOR(Op, DAG);
  5362. case ISD::PREFETCH:
  5363. return LowerPREFETCH(Op, DAG);
  5364. case ISD::SINT_TO_FP:
  5365. case ISD::UINT_TO_FP:
  5366. case ISD::STRICT_SINT_TO_FP:
  5367. case ISD::STRICT_UINT_TO_FP:
  5368. return LowerINT_TO_FP(Op, DAG);
  5369. case ISD::FP_TO_SINT:
  5370. case ISD::FP_TO_UINT:
  5371. case ISD::STRICT_FP_TO_SINT:
  5372. case ISD::STRICT_FP_TO_UINT:
  5373. return LowerFP_TO_INT(Op, DAG);
  5374. case ISD::FP_TO_SINT_SAT:
  5375. case ISD::FP_TO_UINT_SAT:
  5376. return LowerFP_TO_INT_SAT(Op, DAG);
  5377. case ISD::FSINCOS:
  5378. return LowerFSINCOS(Op, DAG);
  5379. case ISD::GET_ROUNDING:
  5380. return LowerGET_ROUNDING(Op, DAG);
  5381. case ISD::SET_ROUNDING:
  5382. return LowerSET_ROUNDING(Op, DAG);
  5383. case ISD::MUL:
  5384. return LowerMUL(Op, DAG);
  5385. case ISD::MULHS:
  5386. return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
  5387. case ISD::MULHU:
  5388. return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
  5389. case ISD::INTRINSIC_W_CHAIN:
  5390. return LowerINTRINSIC_W_CHAIN(Op, DAG);
  5391. case ISD::INTRINSIC_WO_CHAIN:
  5392. return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  5393. case ISD::INTRINSIC_VOID:
  5394. return LowerINTRINSIC_VOID(Op, DAG);
  5395. case ISD::ATOMIC_STORE:
  5396. if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
  5397. assert(Subtarget->hasLSE2());
  5398. return LowerStore128(Op, DAG);
  5399. }
  5400. return SDValue();
  5401. case ISD::STORE:
  5402. return LowerSTORE(Op, DAG);
  5403. case ISD::MSTORE:
  5404. return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
  5405. case ISD::MGATHER:
  5406. return LowerMGATHER(Op, DAG);
  5407. case ISD::MSCATTER:
  5408. return LowerMSCATTER(Op, DAG);
  5409. case ISD::VECREDUCE_SEQ_FADD:
  5410. return LowerVECREDUCE_SEQ_FADD(Op, DAG);
  5411. case ISD::VECREDUCE_ADD:
  5412. case ISD::VECREDUCE_AND:
  5413. case ISD::VECREDUCE_OR:
  5414. case ISD::VECREDUCE_XOR:
  5415. case ISD::VECREDUCE_SMAX:
  5416. case ISD::VECREDUCE_SMIN:
  5417. case ISD::VECREDUCE_UMAX:
  5418. case ISD::VECREDUCE_UMIN:
  5419. case ISD::VECREDUCE_FADD:
  5420. case ISD::VECREDUCE_FMAX:
  5421. case ISD::VECREDUCE_FMIN:
  5422. return LowerVECREDUCE(Op, DAG);
  5423. case ISD::ATOMIC_LOAD_SUB:
  5424. return LowerATOMIC_LOAD_SUB(Op, DAG);
  5425. case ISD::ATOMIC_LOAD_AND:
  5426. return LowerATOMIC_LOAD_AND(Op, DAG);
  5427. case ISD::DYNAMIC_STACKALLOC:
  5428. return LowerDYNAMIC_STACKALLOC(Op, DAG);
  5429. case ISD::VSCALE:
  5430. return LowerVSCALE(Op, DAG);
  5431. case ISD::ANY_EXTEND:
  5432. case ISD::SIGN_EXTEND:
  5433. case ISD::ZERO_EXTEND:
  5434. return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
  5435. case ISD::SIGN_EXTEND_INREG: {
  5436. // Only custom lower when ExtraVT has a legal byte based element type.
  5437. EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  5438. EVT ExtraEltVT = ExtraVT.getVectorElementType();
  5439. if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
  5440. (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
  5441. return SDValue();
  5442. return LowerToPredicatedOp(Op, DAG,
  5443. AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
  5444. }
  5445. case ISD::TRUNCATE:
  5446. return LowerTRUNCATE(Op, DAG);
  5447. case ISD::MLOAD:
  5448. return LowerMLOAD(Op, DAG);
  5449. case ISD::LOAD:
  5450. if (useSVEForFixedLengthVectorVT(Op.getValueType(),
  5451. Subtarget->forceStreamingCompatibleSVE()))
  5452. return LowerFixedLengthVectorLoadToSVE(Op, DAG);
  5453. return LowerLOAD(Op, DAG);
  5454. case ISD::ADD:
  5455. case ISD::AND:
  5456. case ISD::SUB:
  5457. return LowerToScalableOp(Op, DAG);
  5458. case ISD::FMAXIMUM:
  5459. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
  5460. case ISD::FMAXNUM:
  5461. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
  5462. case ISD::FMINIMUM:
  5463. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
  5464. case ISD::FMINNUM:
  5465. return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
  5466. case ISD::VSELECT:
  5467. return LowerFixedLengthVectorSelectToSVE(Op, DAG);
  5468. case ISD::ABS:
  5469. return LowerABS(Op, DAG);
  5470. case ISD::ABDS:
  5471. return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
  5472. case ISD::ABDU:
  5473. return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
  5474. case ISD::AVGFLOORS:
  5475. return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
  5476. case ISD::AVGFLOORU:
  5477. return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
  5478. case ISD::AVGCEILS:
  5479. return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
  5480. case ISD::AVGCEILU:
  5481. return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
  5482. case ISD::BITREVERSE:
  5483. return LowerBitreverse(Op, DAG);
  5484. case ISD::BSWAP:
  5485. return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
  5486. case ISD::CTLZ:
  5487. return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
  5488. case ISD::CTTZ:
  5489. return LowerCTTZ(Op, DAG);
  5490. case ISD::VECTOR_SPLICE:
  5491. return LowerVECTOR_SPLICE(Op, DAG);
  5492. case ISD::STRICT_LROUND:
  5493. case ISD::STRICT_LLROUND:
  5494. case ISD::STRICT_LRINT:
  5495. case ISD::STRICT_LLRINT: {
  5496. assert(Op.getOperand(1).getValueType() == MVT::f16 &&
  5497. "Expected custom lowering of rounding operations only for f16");
  5498. SDLoc DL(Op);
  5499. SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
  5500. {Op.getOperand(0), Op.getOperand(1)});
  5501. return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
  5502. {Ext.getValue(1), Ext.getValue(0)});
  5503. }
  5504. case ISD::WRITE_REGISTER: {
  5505. assert(Op.getOperand(2).getValueType() == MVT::i128 &&
  5506. "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
  5507. SDLoc DL(Op);
  5508. SDValue Chain = Op.getOperand(0);
  5509. SDValue SysRegName = Op.getOperand(1);
  5510. SDValue Pair = Op.getOperand(2);
  5511. SDValue PairLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair,
  5512. DAG.getConstant(0, DL, MVT::i32));
  5513. SDValue PairHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair,
  5514. DAG.getConstant(1, DL, MVT::i32));
  5515. // chain = MSRR(chain, sysregname, lo, hi)
  5516. SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
  5517. SysRegName, PairLo, PairHi);
  5518. return Result;
  5519. }
  5520. }
  5521. }
  5522. bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
  5523. return !Subtarget->useSVEForFixedLengthVectors();
  5524. }
  5525. bool AArch64TargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
  5526. return true;
  5527. }
  5528. bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
  5529. EVT VT, bool OverrideNEON) const {
  5530. if (!VT.isFixedLengthVector() || !VT.isSimple())
  5531. return false;
  5532. // Don't use SVE for vectors we cannot scalarize if required.
  5533. switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  5534. // Fixed length predicates should be promoted to i8.
  5535. // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
  5536. case MVT::i1:
  5537. default:
  5538. return false;
  5539. case MVT::i8:
  5540. case MVT::i16:
  5541. case MVT::i32:
  5542. case MVT::i64:
  5543. case MVT::f16:
  5544. case MVT::f32:
  5545. case MVT::f64:
  5546. break;
  5547. }
  5548. // All SVE implementations support NEON sized vectors.
  5549. if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
  5550. return Subtarget->hasSVE();
  5551. // Ensure NEON MVTs only belong to a single register class.
  5552. if (VT.getFixedSizeInBits() <= 128)
  5553. return false;
  5554. // Ensure wider than NEON code generation is enabled.
  5555. if (!Subtarget->useSVEForFixedLengthVectors())
  5556. return false;
  5557. // Don't use SVE for types that don't fit.
  5558. if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
  5559. return false;
  5560. // TODO: Perhaps an artificial restriction, but worth having whilst getting
  5561. // the base fixed length SVE support in place.
  5562. if (!VT.isPow2VectorType())
  5563. return false;
  5564. return true;
  5565. }
  5566. //===----------------------------------------------------------------------===//
  5567. // Calling Convention Implementation
  5568. //===----------------------------------------------------------------------===//
  5569. static unsigned getIntrinsicID(const SDNode *N) {
  5570. unsigned Opcode = N->getOpcode();
  5571. switch (Opcode) {
  5572. default:
  5573. return Intrinsic::not_intrinsic;
  5574. case ISD::INTRINSIC_WO_CHAIN: {
  5575. unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  5576. if (IID < Intrinsic::num_intrinsics)
  5577. return IID;
  5578. return Intrinsic::not_intrinsic;
  5579. }
  5580. }
  5581. }
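// Reassociation is rejected when it would break an add-of-widening-multiply
// pattern that can select to a multiply-accumulate. A rough sketch of the
// shape being protected (IR names and types are illustrative only):
//
//   %m = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
//   %r = add <8 x i16> %acc, %m    ; keep %m as a direct operand -> umlal
//
// Folding %acc deeper into the expression would stop the add and the widening
// multiply from being selected together as umlal/smlal.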
  5582. bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
  5583. SDValue N1) const {
  5584. if (!N0.hasOneUse())
  5585. return false;
  5586. unsigned IID = getIntrinsicID(N1.getNode());
  5587. // Avoid reassociating expressions that can be lowered to smlal/umlal.
  5588. if (IID == Intrinsic::aarch64_neon_umull ||
  5589. N1.getOpcode() == AArch64ISD::UMULL ||
  5590. IID == Intrinsic::aarch64_neon_smull ||
  5591. N1.getOpcode() == AArch64ISD::SMULL)
  5592. return N0.getOpcode() != ISD::ADD;
  5593. return true;
  5594. }
  5595. /// Selects the correct CCAssignFn for a given CallingConvention value.
  5596. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
  5597. bool IsVarArg) const {
  5598. switch (CC) {
  5599. default:
  5600. report_fatal_error("Unsupported calling convention.");
  5601. case CallingConv::WebKit_JS:
  5602. return CC_AArch64_WebKit_JS;
  5603. case CallingConv::GHC:
  5604. return CC_AArch64_GHC;
  5605. case CallingConv::C:
  5606. case CallingConv::Fast:
  5607. case CallingConv::PreserveMost:
  5608. case CallingConv::CXX_FAST_TLS:
  5609. case CallingConv::Swift:
  5610. case CallingConv::SwiftTail:
  5611. case CallingConv::Tail:
  5612. if (Subtarget->isTargetWindows() && IsVarArg) {
  5613. if (Subtarget->isWindowsArm64EC())
  5614. return CC_AArch64_Arm64EC_VarArg;
  5615. return CC_AArch64_Win64_VarArg;
  5616. }
  5617. if (!Subtarget->isTargetDarwin())
  5618. return CC_AArch64_AAPCS;
  5619. if (!IsVarArg)
  5620. return CC_AArch64_DarwinPCS;
  5621. return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
  5622. : CC_AArch64_DarwinPCS_VarArg;
  5623. case CallingConv::Win64:
  5624. if (IsVarArg) {
  5625. if (Subtarget->isWindowsArm64EC())
  5626. return CC_AArch64_Arm64EC_VarArg;
  5627. return CC_AArch64_Win64_VarArg;
  5628. }
  5629. return CC_AArch64_AAPCS;
  5630. case CallingConv::CFGuard_Check:
  5631. return CC_AArch64_Win64_CFGuard_Check;
  5632. case CallingConv::AArch64_VectorCall:
  5633. case CallingConv::AArch64_SVE_VectorCall:
  5634. case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
  5635. case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
  5636. return CC_AArch64_AAPCS;
  5637. }
  5638. }
  5639. CCAssignFn *
  5640. AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  5641. return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
  5642. : RetCC_AArch64_AAPCS;
  5643. }
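// Allocate the SME lazy-save area for this function: a dynamically sized
// buffer of SVL.B * SVL.B bytes to hold the worst-case ZA contents, plus a
// 16-byte stack object for the TPIDR2 block that records the buffer address.
// Returns the frame index of the TPIDR2 object.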
  5644. unsigned
  5645. AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
  5646. SelectionDAG &DAG) const {
  5647. MachineFunction &MF = DAG.getMachineFunction();
  5648. MachineFrameInfo &MFI = MF.getFrameInfo();
  5649. // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
  5650. SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
  5651. DAG.getConstant(1, DL, MVT::i32));
  5652. SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
  5653. SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
  5654. SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
  5655. SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
  5656. Chain = Buffer.getValue(1);
  5657. MFI.CreateVariableSizedObject(Align(1), nullptr);
  5658. // Allocate an additional TPIDR2 object on the stack (16 bytes)
  5659. unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
  5660. // Store the buffer pointer to the TPIDR2 stack object.
  5661. MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
  5662. SDValue Ptr = DAG.getFrameIndex(
  5663. TPIDR2Obj,
  5664. DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  5665. Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
  5666. return TPIDR2Obj;
  5667. }
  5668. SDValue AArch64TargetLowering::LowerFormalArguments(
  5669. SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
  5670. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
  5671. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  5672. MachineFunction &MF = DAG.getMachineFunction();
  5673. const Function &F = MF.getFunction();
  5674. MachineFrameInfo &MFI = MF.getFrameInfo();
  5675. bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
  5676. AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  5677. SmallVector<ISD::OutputArg, 4> Outs;
  5678. GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
  5679. DAG.getTargetLoweringInfo(), MF.getDataLayout());
  5680. if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
  5681. FuncInfo->setIsSVECC(true);
  5682. // Assign locations to all of the incoming arguments.
  5683. SmallVector<CCValAssign, 16> ArgLocs;
  5684. DenseMap<unsigned, SDValue> CopiedRegs;
  5685. CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  5686. // At this point, Ins[].VT may already be promoted to i32. To correctly
  5687. // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
  5688. // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
  5689. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
  5690. // we use a special version of AnalyzeFormalArguments to pass in ValVT and
  5691. // LocVT.
  5692. unsigned NumArgs = Ins.size();
  5693. Function::const_arg_iterator CurOrigArg = F.arg_begin();
  5694. unsigned CurArgIdx = 0;
  5695. for (unsigned i = 0; i != NumArgs; ++i) {
  5696. MVT ValVT = Ins[i].VT;
  5697. if (Ins[i].isOrigArg()) {
  5698. std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
  5699. CurArgIdx = Ins[i].getOrigArgIndex();
  5700. // Get type of the original argument.
  5701. EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
  5702. /*AllowUnknown*/ true);
  5703. MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
  5704. // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
  5705. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
  5706. ValVT = MVT::i8;
  5707. else if (ActualMVT == MVT::i16)
  5708. ValVT = MVT::i16;
  5709. }
  5710. bool UseVarArgCC = false;
  5711. if (IsWin64)
  5712. UseVarArgCC = isVarArg;
  5713. CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
  5714. bool Res =
  5715. AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
  5716. assert(!Res && "Call operand has unhandled type");
  5717. (void)Res;
  5718. }
  5719. SMEAttrs Attrs(MF.getFunction());
  5720. bool IsLocallyStreaming =
  5721. !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
  5722. assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
  5723. SDValue Glue = Chain.getValue(1);
  5724. SmallVector<SDValue, 16> ArgValues;
  5725. unsigned ExtraArgLocs = 0;
  5726. for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
  5727. CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
  5728. if (Ins[i].Flags.isByVal()) {
  5729. // Byval is used for HFAs in the PCS, but the system should work in a
  5730. // non-compliant manner for larger structs.
  5731. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  5732. int Size = Ins[i].Flags.getByValSize();
  5733. unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types.
  5736. unsigned FrameIdx =
  5737. MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
  5738. SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
  5739. InVals.push_back(FrameIdxN);
  5740. continue;
  5741. }
  5742. if (Ins[i].Flags.isSwiftAsync())
  5743. MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
  5744. SDValue ArgValue;
  5745. if (VA.isRegLoc()) {
  5746. // Arguments stored in registers.
  5747. EVT RegVT = VA.getLocVT();
  5748. const TargetRegisterClass *RC;
  5749. if (RegVT == MVT::i32)
  5750. RC = &AArch64::GPR32RegClass;
  5751. else if (RegVT == MVT::i64)
  5752. RC = &AArch64::GPR64RegClass;
  5753. else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
  5754. RC = &AArch64::FPR16RegClass;
  5755. else if (RegVT == MVT::f32)
  5756. RC = &AArch64::FPR32RegClass;
  5757. else if (RegVT == MVT::f64 || RegVT.is64BitVector())
  5758. RC = &AArch64::FPR64RegClass;
  5759. else if (RegVT == MVT::f128 || RegVT.is128BitVector())
  5760. RC = &AArch64::FPR128RegClass;
  5761. else if (RegVT.isScalableVector() &&
  5762. RegVT.getVectorElementType() == MVT::i1) {
  5763. FuncInfo->setIsSVECC(true);
  5764. RC = &AArch64::PPRRegClass;
  5765. } else if (RegVT.isScalableVector()) {
  5766. FuncInfo->setIsSVECC(true);
  5767. RC = &AArch64::ZPRRegClass;
  5768. } else
  5769. llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
  5770. // Transform the arguments in physical registers into virtual ones.
  5771. Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  5772. if (IsLocallyStreaming) {
  5773. // LocallyStreamingFunctions must insert the SMSTART in the correct
  5774. // position, so we use Glue to ensure no instructions can be scheduled
  5775. // between the chain of:
  5776. // t0: ch,glue = EntryNode
  5777. // t1: res,ch,glue = CopyFromReg
  5778. // ...
  5779. // tn: res,ch,glue = CopyFromReg t(n-1), ..
  5780. // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
  5781. // ^^^^^^
  5782. // This will be the new Chain/Root node.
  5783. ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
  5784. Glue = ArgValue.getValue(2);
  5785. } else
  5786. ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
  5787. // If this is an 8, 16 or 32-bit value, it is really passed promoted
  5788. // to 64 bits. Insert an assert[sz]ext to capture this, then
  5789. // truncate to the right size.
  5790. switch (VA.getLocInfo()) {
  5791. default:
  5792. llvm_unreachable("Unknown loc info!");
  5793. case CCValAssign::Full:
  5794. break;
  5795. case CCValAssign::Indirect:
  5796. assert((VA.getValVT().isScalableVector() ||
  5797. Subtarget->isWindowsArm64EC()) &&
  5798. "Indirect arguments should be scalable on most subtargets");
  5799. break;
  5800. case CCValAssign::BCvt:
  5801. ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
  5802. break;
  5803. case CCValAssign::AExt:
  5804. case CCValAssign::SExt:
  5805. case CCValAssign::ZExt:
  5806. break;
  5807. case CCValAssign::AExtUpper:
  5808. ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
  5809. DAG.getConstant(32, DL, RegVT));
  5810. ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
  5811. break;
  5812. }
  5813. } else { // VA.isRegLoc()
  5814. assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
  5815. unsigned ArgOffset = VA.getLocMemOffset();
  5816. unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
  5817. ? VA.getLocVT().getSizeInBits()
  5818. : VA.getValVT().getSizeInBits()) / 8;
  5819. uint32_t BEAlign = 0;
  5820. if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
  5821. !Ins[i].Flags.isInConsecutiveRegs())
  5822. BEAlign = 8 - ArgSize;
  5823. SDValue FIN;
  5824. MachinePointerInfo PtrInfo;
  5825. if (isVarArg && Subtarget->isWindowsArm64EC()) {
  5826. // In the ARM64EC varargs convention, fixed arguments on the stack are
  5827. // accessed relative to x4, not sp.
  5828. unsigned ObjOffset = ArgOffset + BEAlign;
  5829. Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
  5830. SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
  5831. FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
  5832. DAG.getConstant(ObjOffset, DL, MVT::i64));
  5833. PtrInfo = MachinePointerInfo::getUnknownStack(MF);
  5834. } else {
  5835. int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
  5836. // Create load nodes to retrieve arguments from the stack.
  5837. FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
  5838. PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  5839. }
// For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
  5841. ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  5842. MVT MemVT = VA.getValVT();
  5843. switch (VA.getLocInfo()) {
  5844. default:
  5845. break;
  5846. case CCValAssign::Trunc:
  5847. case CCValAssign::BCvt:
  5848. MemVT = VA.getLocVT();
  5849. break;
  5850. case CCValAssign::Indirect:
  5851. assert((VA.getValVT().isScalableVector() ||
  5852. Subtarget->isWindowsArm64EC()) &&
  5853. "Indirect arguments should be scalable on most subtargets");
  5854. MemVT = VA.getLocVT();
  5855. break;
  5856. case CCValAssign::SExt:
  5857. ExtType = ISD::SEXTLOAD;
  5858. break;
  5859. case CCValAssign::ZExt:
  5860. ExtType = ISD::ZEXTLOAD;
  5861. break;
  5862. case CCValAssign::AExt:
  5863. ExtType = ISD::EXTLOAD;
  5864. break;
  5865. }
  5866. ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
  5867. MemVT);
  5868. }
  5869. if (VA.getLocInfo() == CCValAssign::Indirect) {
  5870. assert(
  5871. (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
  5872. "Indirect arguments should be scalable on most subtargets");
  5873. uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
  5874. unsigned NumParts = 1;
  5875. if (Ins[i].Flags.isInConsecutiveRegs()) {
  5876. assert(!Ins[i].Flags.isInConsecutiveRegsLast());
  5877. while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
  5878. ++NumParts;
  5879. }
  5880. MVT PartLoad = VA.getValVT();
  5881. SDValue Ptr = ArgValue;
  5882. // Ensure we generate all loads for each tuple part, whilst updating the
  5883. // pointer after each load correctly using vscale.
  5884. while (NumParts > 0) {
  5885. ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
  5886. InVals.push_back(ArgValue);
  5887. NumParts--;
  5888. if (NumParts > 0) {
  5889. SDValue BytesIncrement;
  5890. if (PartLoad.isScalableVector()) {
  5891. BytesIncrement = DAG.getVScale(
  5892. DL, Ptr.getValueType(),
  5893. APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
  5894. } else {
  5895. BytesIncrement = DAG.getConstant(
  5896. APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
  5897. Ptr.getValueType());
  5898. }
  5899. SDNodeFlags Flags;
  5900. Flags.setNoUnsignedWrap(true);
  5901. Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
  5902. BytesIncrement, Flags);
  5903. ExtraArgLocs++;
  5904. i++;
  5905. }
  5906. }
  5907. } else {
  5908. if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
  5909. ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
  5910. ArgValue, DAG.getValueType(MVT::i32));
  5911. // i1 arguments are zero-extended to i8 by the caller. Emit a
  5912. // hint to reflect this.
  5913. if (Ins[i].isOrigArg()) {
  5914. Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
  5915. if (OrigArg->getType()->isIntegerTy(1)) {
  5916. if (!Ins[i].Flags.isZExt()) {
  5917. ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
  5918. ArgValue.getValueType(), ArgValue);
  5919. }
  5920. }
  5921. }
  5922. InVals.push_back(ArgValue);
  5923. }
  5924. }
  5925. assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
  5926. // Insert the SMSTART if this is a locally streaming function and
  5927. // make sure it is Glued to the last CopyFromReg value.
  5928. if (IsLocallyStreaming) {
  5929. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  5930. Chain = DAG.getNode(
  5931. AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
  5932. {DAG.getRoot(),
  5933. DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
  5934. DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
  5935. DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
  5936. // Ensure that the SMSTART happens after the CopyWithChain such that its
  5937. // chain result is used.
  5938. for (unsigned I=0; I<InVals.size(); ++I) {
  5939. Register Reg = MF.getRegInfo().createVirtualRegister(
  5940. getRegClassFor(InVals[I].getValueType().getSimpleVT()));
  5941. Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
  5942. InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
  5943. InVals[I].getValueType());
  5944. }
  5945. }
  5946. // varargs
  5947. if (isVarArg) {
  5948. if (!Subtarget->isTargetDarwin() || IsWin64) {
  5949. // The AAPCS variadic function ABI is identical to the non-variadic
  5950. // one. As a result there may be more arguments in registers and we should
  5951. // save them for future reference.
  5952. // Win64 variadic functions also pass arguments in registers, but all float
  5953. // arguments are passed in integer registers.
  5954. saveVarArgRegisters(CCInfo, DAG, DL, Chain);
  5955. }
  5956. // This will point to the next argument passed via stack.
  5957. unsigned StackOffset = CCInfo.getNextStackOffset();
  5958. // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
  5959. StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
  5960. FuncInfo->setVarArgsStackOffset(StackOffset);
  5961. FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
  5962. if (MFI.hasMustTailInVarArgFunc()) {
  5963. SmallVector<MVT, 2> RegParmTypes;
  5964. RegParmTypes.push_back(MVT::i64);
  5965. RegParmTypes.push_back(MVT::f128);
  5966. // Compute the set of forwarded registers. The rest are scratch.
  5967. SmallVectorImpl<ForwardedRegister> &Forwards =
  5968. FuncInfo->getForwardedMustTailRegParms();
  5969. CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
  5970. CC_AArch64_AAPCS);
  5971. // Conservatively forward X8, since it might be used for aggregate return.
  5972. if (!CCInfo.isAllocated(AArch64::X8)) {
  5973. Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
  5974. Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
  5975. }
  5976. }
  5977. }
  5978. // On Windows, InReg pointers must be returned, so record the pointer in a
  5979. // virtual register at the start of the function so it can be returned in the
  5980. // epilogue.
  5981. if (IsWin64) {
  5982. for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
  5983. if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) {
  5984. assert(!FuncInfo->getSRetReturnReg());
  5985. MVT PtrTy = getPointerTy(DAG.getDataLayout());
  5986. Register Reg =
  5987. MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
  5988. FuncInfo->setSRetReturnReg(Reg);
  5989. SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
  5990. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
  5991. break;
  5992. }
  5993. }
  5994. }
  5995. unsigned StackArgSize = CCInfo.getNextStackOffset();
  5996. bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  5997. if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
  5998. // This is a non-standard ABI so by fiat I say we're allowed to make full
  5999. // use of the stack area to be popped, which must be aligned to 16 bytes in
  6000. // any case:
  6001. StackArgSize = alignTo(StackArgSize, 16);
  6002. // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
  6003. // a multiple of 16.
  6004. FuncInfo->setArgumentStackToRestore(StackArgSize);
  6005. // This realignment carries over to the available bytes below. Our own
  6006. // callers will guarantee the space is free by giving an aligned value to
  6007. // CALLSEQ_START.
  6008. }
  6009. // Even if we're not expected to free up the space, it's useful to know how
  6010. // much is there while considering tail calls (because we can reuse it).
  6011. FuncInfo->setBytesInStackArgArea(StackArgSize);
  6012. if (Subtarget->hasCustomCallingConv())
  6013. Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
  6014. // Conservatively assume the function requires the lazy-save mechanism.
  6015. if (SMEAttrs(MF.getFunction()).hasZAState()) {
  6016. unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
  6017. FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
  6018. }
  6019. return Chain;
  6020. }
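// Spill the remaining unnamed argument registers of a variadic function to
// the stack so va_arg can walk them: x0-x7 (x0-x3 for Arm64EC) into an
// 8-byte-aligned GPR save area, and q0-q7 into a 16-byte-aligned FPR save
// area when FP/SIMD is available and this is not a Win64 function.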
  6021. void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
  6022. SelectionDAG &DAG,
  6023. const SDLoc &DL,
  6024. SDValue &Chain) const {
  6025. MachineFunction &MF = DAG.getMachineFunction();
  6026. MachineFrameInfo &MFI = MF.getFrameInfo();
  6027. AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  6028. auto PtrVT = getPointerTy(DAG.getDataLayout());
  6029. bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
  6030. SmallVector<SDValue, 8> MemOps;
  6031. static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
  6032. AArch64::X3, AArch64::X4, AArch64::X5,
  6033. AArch64::X6, AArch64::X7 };
  6034. unsigned NumGPRArgRegs = std::size(GPRArgRegs);
  6035. if (Subtarget->isWindowsArm64EC()) {
  6036. // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
  6037. // functions.
  6038. NumGPRArgRegs = 4;
  6039. }
  6040. unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
  6041. unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  6042. int GPRIdx = 0;
  6043. if (GPRSaveSize != 0) {
  6044. if (IsWin64) {
  6045. GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
  6046. if (GPRSaveSize & 15)
  6047. // The extra size here, if triggered, will always be 8.
  6048. MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
  6049. } else
  6050. GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
  6051. SDValue FIN;
  6052. if (Subtarget->isWindowsArm64EC()) {
  6053. // With the Arm64EC ABI, we reserve the save area as usual, but we
  6054. // compute its address relative to x4. For a normal AArch64->AArch64
  6055. // call, x4 == sp on entry, but calls from an entry thunk can pass in a
  6056. // different address.
  6057. Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
  6058. SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
  6059. FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
  6060. DAG.getConstant(GPRSaveSize, DL, MVT::i64));
  6061. } else {
  6062. FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
  6063. }
  6064. for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
  6065. Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
  6066. SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
  6067. SDValue Store =
  6068. DAG.getStore(Val.getValue(1), DL, Val, FIN,
  6069. IsWin64 ? MachinePointerInfo::getFixedStack(
  6070. MF, GPRIdx, (i - FirstVariadicGPR) * 8)
  6071. : MachinePointerInfo::getStack(MF, i * 8));
  6072. MemOps.push_back(Store);
  6073. FIN =
  6074. DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
  6075. }
  6076. }
  6077. FuncInfo->setVarArgsGPRIndex(GPRIdx);
  6078. FuncInfo->setVarArgsGPRSize(GPRSaveSize);
  6079. if (Subtarget->hasFPARMv8() && !IsWin64) {
  6080. static const MCPhysReg FPRArgRegs[] = {
  6081. AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
  6082. AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
  6083. static const unsigned NumFPRArgRegs = std::size(FPRArgRegs);
  6084. unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
  6085. unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
  6086. int FPRIdx = 0;
  6087. if (FPRSaveSize != 0) {
  6088. FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
  6089. SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
  6090. for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
  6091. Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
  6092. SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
  6093. SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
  6094. MachinePointerInfo::getStack(MF, i * 16));
  6095. MemOps.push_back(Store);
  6096. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
  6097. DAG.getConstant(16, DL, PtrVT));
  6098. }
  6099. }
  6100. FuncInfo->setVarArgsFPRIndex(FPRIdx);
  6101. FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  6102. }
  6103. if (!MemOps.empty()) {
  6104. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  6105. }
  6106. }
  6107. /// LowerCallResult - Lower the result values of a call into the
  6108. /// appropriate copies out of appropriate physical registers.
  6109. SDValue AArch64TargetLowering::LowerCallResult(
  6110. SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
  6111. const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
  6112. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
  6113. SDValue ThisVal) const {
  6114. DenseMap<unsigned, SDValue> CopiedRegs;
  6115. // Copy all of the result registers out of their specified physreg.
  6116. for (unsigned i = 0; i != RVLocs.size(); ++i) {
  6117. CCValAssign VA = RVLocs[i];
6118. // Pass the 'this' value directly from the argument to the return value, to
6119. // avoid register unit interference.
  6120. if (i == 0 && isThisReturn) {
  6121. assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
  6122. "unexpected return calling convention register assignment");
  6123. InVals.push_back(ThisVal);
  6124. continue;
  6125. }
6126. // Avoid copying a physreg twice: RegAllocFast only allows one use of a
6127. // physreg per block, so reuse an existing copy of this register.
  6128. SDValue Val = CopiedRegs.lookup(VA.getLocReg());
  6129. if (!Val) {
  6130. Val =
  6131. DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
  6132. Chain = Val.getValue(1);
  6133. InFlag = Val.getValue(2);
  6134. CopiedRegs[VA.getLocReg()] = Val;
  6135. }
  6136. switch (VA.getLocInfo()) {
  6137. default:
  6138. llvm_unreachable("Unknown loc info!");
  6139. case CCValAssign::Full:
  6140. break;
  6141. case CCValAssign::BCvt:
  6142. Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
  6143. break;
  6144. case CCValAssign::AExtUpper:
  6145. Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
  6146. DAG.getConstant(32, DL, VA.getLocVT()));
  6147. [[fallthrough]];
  6148. case CCValAssign::AExt:
  6149. [[fallthrough]];
  6150. case CCValAssign::ZExt:
  6151. Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
  6152. break;
  6153. }
  6154. InVals.push_back(Val);
  6155. }
  6156. return Chain;
  6157. }
  6158. /// Return true if the calling convention is one that we can guarantee TCO for.
  6159. static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  6160. return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
  6161. CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
  6162. }
  6163. /// Return true if we might ever do TCO for calls with this calling convention.
  6164. static bool mayTailCallThisCC(CallingConv::ID CC) {
  6165. switch (CC) {
  6166. case CallingConv::C:
  6167. case CallingConv::AArch64_SVE_VectorCall:
  6168. case CallingConv::PreserveMost:
  6169. case CallingConv::Swift:
  6170. case CallingConv::SwiftTail:
  6171. case CallingConv::Tail:
  6172. case CallingConv::Fast:
  6173. return true;
  6174. default:
  6175. return false;
  6176. }
  6177. }
  6178. static void analyzeCallOperands(const AArch64TargetLowering &TLI,
  6179. const AArch64Subtarget *Subtarget,
  6180. const TargetLowering::CallLoweringInfo &CLI,
  6181. CCState &CCInfo) {
  6182. const SelectionDAG &DAG = CLI.DAG;
  6183. CallingConv::ID CalleeCC = CLI.CallConv;
  6184. bool IsVarArg = CLI.IsVarArg;
  6185. const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  6186. bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  6187. unsigned NumArgs = Outs.size();
  6188. for (unsigned i = 0; i != NumArgs; ++i) {
  6189. MVT ArgVT = Outs[i].VT;
  6190. ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
  6191. bool UseVarArgCC = false;
  6192. if (IsVarArg) {
  6193. // On Windows, the fixed arguments in a vararg call are passed in GPRs
  6194. // too, so use the vararg CC to force them to integer registers.
  6195. if (IsCalleeWin64) {
  6196. UseVarArgCC = true;
  6197. } else {
  6198. UseVarArgCC = !Outs[i].IsFixed;
  6199. }
  6200. }
  6201. if (!UseVarArgCC) {
  6202. // Get type of the original argument.
  6203. EVT ActualVT =
  6204. TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
  6205. /*AllowUnknown*/ true);
  6206. MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
  6207. // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
  6208. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
  6209. ArgVT = MVT::i8;
  6210. else if (ActualMVT == MVT::i16)
  6211. ArgVT = MVT::i16;
  6212. }
  6213. CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
  6214. bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
  6215. assert(!Res && "Call operand has unhandled type");
  6216. (void)Res;
  6217. }
  6218. }
  6219. bool AArch64TargetLowering::isEligibleForTailCallOptimization(
  6220. const CallLoweringInfo &CLI) const {
  6221. CallingConv::ID CalleeCC = CLI.CallConv;
  6222. if (!mayTailCallThisCC(CalleeCC))
  6223. return false;
  6224. SDValue Callee = CLI.Callee;
  6225. bool IsVarArg = CLI.IsVarArg;
  6226. const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  6227. const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  6228. const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  6229. const SelectionDAG &DAG = CLI.DAG;
  6230. MachineFunction &MF = DAG.getMachineFunction();
  6231. const Function &CallerF = MF.getFunction();
  6232. CallingConv::ID CallerCC = CallerF.getCallingConv();
  6233. // SME Streaming functions are not eligible for TCO as they may require
  6234. // the streaming mode or ZA to be restored after returning from the call.
  6235. SMEAttrs CallerAttrs(MF.getFunction());
  6236. auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
  6237. if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
  6238. CallerAttrs.requiresLazySave(CalleeAttrs))
  6239. return false;
  6240. // Functions using the C or Fast calling convention that have an SVE signature
  6241. // preserve more registers and should assume the SVE_VectorCall CC.
  6242. // The check for matching callee-saved regs will determine whether it is
  6243. // eligible for TCO.
  6244. if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
  6245. MF.getInfo<AArch64FunctionInfo>()->isSVECC())
  6246. CallerCC = CallingConv::AArch64_SVE_VectorCall;
  6247. bool CCMatch = CallerCC == CalleeCC;
6248. // When using the Windows calling convention on a non-Windows OS, we want
  6249. // to back up and restore X18 in such functions; we can't do a tail call
  6250. // from those functions.
  6251. if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
  6252. CalleeCC != CallingConv::Win64)
  6253. return false;
  6254. // Byval parameters hand the function a pointer directly into the stack area
  6255. // we want to reuse during a tail call. Working around this *is* possible (see
  6256. // X86) but less efficient and uglier in LowerCall.
  6257. for (Function::const_arg_iterator i = CallerF.arg_begin(),
  6258. e = CallerF.arg_end();
  6259. i != e; ++i) {
  6260. if (i->hasByValAttr())
  6261. return false;
  6262. // On Windows, "inreg" attributes signify non-aggregate indirect returns.
  6263. // In this case, it is necessary to save/restore X0 in the callee. Tail
  6264. // call opt interferes with this. So we disable tail call opt when the
  6265. // caller has an argument with "inreg" attribute.
  6266. // FIXME: Check whether the callee also has an "inreg" argument.
  6267. if (i->hasInRegAttr())
  6268. return false;
  6269. }
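// With guaranteed TCO, the only remaining requirement is that the caller and
// callee use the same calling convention.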
  6270. if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
  6271. return CCMatch;
  6272. // Externally-defined functions with weak linkage should not be
  6273. // tail-called on AArch64 when the OS does not support dynamic
  6274. // pre-emption of symbols, as the AAELF spec requires normal calls
  6275. // to undefined weak functions to be replaced with a NOP or jump to the
  6276. // next instruction. The behaviour of branch instructions in this
  6277. // situation (as used for tail calls) is implementation-defined, so we
  6278. // cannot rely on the linker replacing the tail call with a return.
  6279. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
  6280. const GlobalValue *GV = G->getGlobal();
  6281. const Triple &TT = getTargetMachine().getTargetTriple();
  6282. if (GV->hasExternalWeakLinkage() &&
  6283. (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
  6284. return false;
  6285. }
  6286. // Now we search for cases where we can use a tail call without changing the
  6287. // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  6288. // concept.
  6289. // I want anyone implementing a new calling convention to think long and hard
  6290. // about this assert.
  6291. assert((!IsVarArg || CalleeCC == CallingConv::C) &&
  6292. "Unexpected variadic calling convention");
  6293. LLVMContext &C = *DAG.getContext();
  6294. // Check that the call results are passed in the same way.
  6295. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
  6296. CCAssignFnForCall(CalleeCC, IsVarArg),
  6297. CCAssignFnForCall(CallerCC, IsVarArg)))
  6298. return false;
  6299. // The callee has to preserve all registers the caller needs to preserve.
  6300. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  6301. const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  6302. if (!CCMatch) {
  6303. const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  6304. if (Subtarget->hasCustomCallingConv()) {
  6305. TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
  6306. TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
  6307. }
  6308. if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  6309. return false;
  6310. }
  6311. // Nothing more to check if the callee is taking no arguments
  6312. if (Outs.empty())
  6313. return true;
  6314. SmallVector<CCValAssign, 16> ArgLocs;
  6315. CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
  6316. analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
  6317. if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
6318. // When the call is musttail, additional checks have already been done, so this check can safely be skipped.
  6319. // At least two cases here: if caller is fastcc then we can't have any
  6320. // memory arguments (we'd be expected to clean up the stack afterwards). If
  6321. // caller is C then we could potentially use its argument area.
  6322. // FIXME: for now we take the most conservative of these in both cases:
  6323. // disallow all variadic memory operands.
  6324. for (const CCValAssign &ArgLoc : ArgLocs)
  6325. if (!ArgLoc.isRegLoc())
  6326. return false;
  6327. }
  6328. const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6329. // If any of the arguments is passed indirectly, it must be SVE, so the
6330. // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
6331. // allocate space on the stack. That is why we check this explicitly here:
6332. // if any argument is passed indirectly, the call cannot be a tail call.
  6333. if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
  6334. assert((A.getLocInfo() != CCValAssign::Indirect ||
  6335. A.getValVT().isScalableVector() ||
  6336. Subtarget->isWindowsArm64EC()) &&
  6337. "Expected value to be scalable");
  6338. return A.getLocInfo() == CCValAssign::Indirect;
  6339. }))
  6340. return false;
  6341. // If the stack arguments for this call do not fit into our own save area then
  6342. // the call cannot be made tail.
  6343. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
  6344. return false;
  6345. const MachineRegisterInfo &MRI = MF.getRegInfo();
  6346. if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
  6347. return false;
  6348. return true;
  6349. }
  6350. SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
  6351. SelectionDAG &DAG,
  6352. MachineFrameInfo &MFI,
  6353. int ClobberedFI) const {
  6354. SmallVector<SDValue, 8> ArgChains;
  6355. int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  6356. int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
  6357. // Include the original chain at the beginning of the list. When this is
  6358. // used by target LowerCall hooks, this helps legalize find the
  6359. // CALLSEQ_BEGIN node.
  6360. ArgChains.push_back(Chain);
6361. // Add a chain value for each incoming stack-argument load that overlaps the clobbered object.
  6362. for (SDNode *U : DAG.getEntryNode().getNode()->uses())
  6363. if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
  6364. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
  6365. if (FI->getIndex() < 0) {
  6366. int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
  6367. int64_t InLastByte = InFirstByte;
  6368. InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
  6369. if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
  6370. (FirstByte <= InFirstByte && InFirstByte <= LastByte))
  6371. ArgChains.push_back(SDValue(L, 1));
  6372. }
  6373. // Build a tokenfactor for all the chains.
  6374. return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
  6375. }
  6376. bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
  6377. bool TailCallOpt) const {
  6378. return (CallCC == CallingConv::Fast && TailCallOpt) ||
  6379. CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
  6380. }
  6381. // Check if the value is zero-extended from i1 to i8
  6382. static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
  6383. unsigned SizeInBits = Arg.getValueType().getSizeInBits();
  6384. if (SizeInBits < 8)
  6385. return false;
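// Bits [7:1] (mask 0xFE) must be known zero; bit 0 may be anything, so the
// value is already a correctly zero-extended bool.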
6386. APInt RequiredZero(SizeInBits, 0xFE);
6387. KnownBits Bits = DAG.computeKnownBits(Arg, 4);
6388. bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
  6389. return ZExtBool;
  6390. }
  6391. SDValue AArch64TargetLowering::changeStreamingMode(
  6392. SelectionDAG &DAG, SDLoc DL, bool Enable,
  6393. SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const {
  6394. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  6395. SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
  6396. SDValue MSROp =
  6397. DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
  6398. SDValue ExpectedSMVal =
  6399. DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
  6400. SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
  6401. if (InFlag)
  6402. Ops.push_back(InFlag);
  6403. unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
  6404. return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
  6405. }
  6406. /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
  6407. /// and add input and output parameter nodes.
  6408. SDValue
  6409. AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  6410. SmallVectorImpl<SDValue> &InVals) const {
  6411. SelectionDAG &DAG = CLI.DAG;
  6412. SDLoc &DL = CLI.DL;
  6413. SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  6414. SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  6415. SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  6416. SDValue Chain = CLI.Chain;
  6417. SDValue Callee = CLI.Callee;
  6418. bool &IsTailCall = CLI.IsTailCall;
  6419. CallingConv::ID &CallConv = CLI.CallConv;
  6420. bool IsVarArg = CLI.IsVarArg;
  6421. MachineFunction &MF = DAG.getMachineFunction();
  6422. MachineFunction::CallSiteInfo CSInfo;
  6423. bool IsThisReturn = false;
  6424. AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  6425. bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  6426. bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
  6427. bool IsSibCall = false;
  6428. bool GuardWithBTI = false;
  6429. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
  6430. !Subtarget->noBTIAtReturnTwice()) {
  6431. GuardWithBTI = FuncInfo->branchTargetEnforcement();
  6432. }
  6433. // Analyze operands of the call, assigning locations to each operand.
  6434. SmallVector<CCValAssign, 16> ArgLocs;
  6435. CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  6436. if (IsVarArg) {
  6437. unsigned NumArgs = Outs.size();
  6438. for (unsigned i = 0; i != NumArgs; ++i) {
  6439. if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
  6440. report_fatal_error("Passing SVE types to variadic functions is "
  6441. "currently not supported");
  6442. }
  6443. }
  6444. analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
  6445. CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  6446. // Assign locations to each value returned by this call.
  6447. SmallVector<CCValAssign, 16> RVLocs;
  6448. CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
  6449. *DAG.getContext());
  6450. RetCCInfo.AnalyzeCallResult(Ins, RetCC);
  6451. // Check callee args/returns for SVE registers and set calling convention
  6452. // accordingly.
  6453. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
  6454. auto HasSVERegLoc = [](CCValAssign &Loc) {
  6455. if (!Loc.isRegLoc())
  6456. return false;
  6457. return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
  6458. AArch64::PPRRegClass.contains(Loc.getLocReg());
  6459. };
  6460. if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
  6461. CallConv = CallingConv::AArch64_SVE_VectorCall;
  6462. }
  6463. if (IsTailCall) {
  6464. // Check if it's really possible to do a tail call.
  6465. IsTailCall = isEligibleForTailCallOptimization(CLI);
  6466. // A sibling call is one where we're under the usual C ABI and not planning
  6467. // to change that but can still do a tail call:
  6468. if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
  6469. CallConv != CallingConv::SwiftTail)
  6470. IsSibCall = true;
  6471. if (IsTailCall)
  6472. ++NumTailCalls;
  6473. }
  6474. if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
  6475. report_fatal_error("failed to perform tail call elimination on a call "
  6476. "site marked musttail");
  6477. // Get a count of how many bytes are to be pushed on the stack.
  6478. unsigned NumBytes = CCInfo.getNextStackOffset();
  6479. if (IsSibCall) {
  6480. // Since we're not changing the ABI to make this a tail call, the memory
  6481. // operands are already available in the caller's incoming argument space.
  6482. NumBytes = 0;
  6483. }
  6484. // FPDiff is the byte offset of the call's argument area from the callee's.
  6485. // Stores to callee stack arguments will be placed in FixedStackSlots offset
  6486. // by this amount for a tail call. In a sibling call it must be 0 because the
  6487. // caller will deallocate the entire stack and the callee still expects its
  6488. // arguments to begin at SP+0. Completely unused for non-tail calls.
  6489. int FPDiff = 0;
  6490. if (IsTailCall && !IsSibCall) {
  6491. unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
  6492. // Since callee will pop argument stack as a tail call, we must keep the
  6493. // popped size 16-byte aligned.
  6494. NumBytes = alignTo(NumBytes, 16);
  6495. // FPDiff will be negative if this tail call requires more space than we
  6496. // would automatically have in our incoming argument space. Positive if we
  6497. // can actually shrink the stack.
  6498. FPDiff = NumReusableBytes - NumBytes;
  6499. // Update the required reserved area if this is the tail call requiring the
  6500. // most argument stack space.
  6501. if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
  6502. FuncInfo->setTailCallReservedStack(-FPDiff);
  6503. // The stack pointer must be 16-byte aligned at all times it's used for a
  6504. // memory operation, which in practice means at *all* times and in
  6505. // particular across call boundaries. Therefore our own arguments started at
  6506. // a 16-byte aligned SP and the delta applied for the tail call should
  6507. // satisfy the same constraint.
  6508. assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  6509. }
  6510. // Determine whether we need any streaming mode changes.
  6511. SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
  6512. if (CLI.CB)
  6513. CalleeAttrs = SMEAttrs(*CLI.CB);
  6514. else if (std::optional<SMEAttrs> Attrs =
  6515. getCalleeAttrsFromExternalFunction(CLI.Callee))
  6516. CalleeAttrs = *Attrs;
  6517. bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
  6518. MachineFrameInfo &MFI = MF.getFrameInfo();
  6519. if (RequiresLazySave) {
  6520. // Set up a lazy save mechanism by storing the runtime live slices
  6521. // (worst-case N*N) to the TPIDR2 stack object.
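// Per the SME lazy-save ABI, the TPIDR2 block holds the save-buffer pointer at
// offset 0 and a 16-bit slice count at offset 8, which is why an i16
// truncating store at +8 is used below.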
  6522. SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
  6523. DAG.getConstant(1, DL, MVT::i32));
  6524. SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
  6525. unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
  6526. MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
  6527. SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
  6528. DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  6529. SDValue BufferPtrAddr =
  6530. DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
  6531. DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
  6532. Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
  6533. Chain = DAG.getNode(
  6534. ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
  6535. DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
  6536. TPIDR2ObjAddr);
  6537. }
  6538. SDValue PStateSM;
  6539. std::optional<bool> RequiresSMChange =
  6540. CallerAttrs.requiresSMChange(CalleeAttrs);
  6541. if (RequiresSMChange)
  6542. PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
  6543. // Adjust the stack pointer for the new arguments...
  6544. // These operations are automatically eliminated by the prolog/epilog pass
  6545. if (!IsSibCall)
  6546. Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
  6547. SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
  6548. getPointerTy(DAG.getDataLayout()));
  6549. SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  6550. SmallSet<unsigned, 8> RegsUsed;
  6551. SmallVector<SDValue, 8> MemOpChains;
  6552. auto PtrVT = getPointerTy(DAG.getDataLayout());
  6553. if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
  6554. const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
  6555. for (const auto &F : Forwards) {
  6556. SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
  6557. RegsToPass.emplace_back(F.PReg, Val);
  6558. }
  6559. }
  6560. // Walk the register/memloc assignments, inserting copies/loads.
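// ExtraArgLocs accounts for indirect tuple arguments: several Outs entries
// (the tuple parts) share a single CCValAssign, so ArgLocs is indexed with
// i - ExtraArgLocs to stay in sync.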
  6561. unsigned ExtraArgLocs = 0;
  6562. for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
  6563. CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
  6564. SDValue Arg = OutVals[i];
  6565. ISD::ArgFlagsTy Flags = Outs[i].Flags;
  6566. // Promote the value if needed.
  6567. switch (VA.getLocInfo()) {
  6568. default:
  6569. llvm_unreachable("Unknown loc info!");
  6570. case CCValAssign::Full:
  6571. break;
  6572. case CCValAssign::SExt:
  6573. Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
  6574. break;
  6575. case CCValAssign::ZExt:
  6576. Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
  6577. break;
  6578. case CCValAssign::AExt:
  6579. if (Outs[i].ArgVT == MVT::i1) {
  6580. // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
  6581. //
  6582. // Check if we actually have to do this, because the value may
  6583. // already be zero-extended.
  6584. //
  6585. // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
  6586. // and rely on DAGCombiner to fold this, because the following
  6587. // (anyext i32) is combined with (zext i8) in DAG.getNode:
  6588. //
  6589. // (ext (zext x)) -> (zext x)
  6590. //
  6591. // This will give us (zext i32), which we cannot remove, so
  6592. // try to check this beforehand.
  6593. if (!checkZExtBool(Arg, DAG)) {
  6594. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
  6595. Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
  6596. }
  6597. }
  6598. Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
  6599. break;
  6600. case CCValAssign::AExtUpper:
  6601. assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
  6602. Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
  6603. Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
  6604. DAG.getConstant(32, DL, VA.getLocVT()));
  6605. break;
  6606. case CCValAssign::BCvt:
  6607. Arg = DAG.getBitcast(VA.getLocVT(), Arg);
  6608. break;
  6609. case CCValAssign::Trunc:
  6610. Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
  6611. break;
  6612. case CCValAssign::FPExt:
  6613. Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
  6614. break;
  6615. case CCValAssign::Indirect:
  6616. bool isScalable = VA.getValVT().isScalableVector();
  6617. assert((isScalable || Subtarget->isWindowsArm64EC()) &&
  6618. "Indirect arguments should be scalable on most subtargets");
  6619. uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
  6620. uint64_t PartSize = StoreSize;
  6621. unsigned NumParts = 1;
  6622. if (Outs[i].Flags.isInConsecutiveRegs()) {
  6623. assert(!Outs[i].Flags.isInConsecutiveRegsLast());
  6624. while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
  6625. ++NumParts;
  6626. StoreSize *= NumParts;
  6627. }
  6628. Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
  6629. Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
  6630. int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
  6631. if (isScalable)
  6632. MFI.setStackID(FI, TargetStackID::ScalableVector);
  6633. MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
  6634. SDValue Ptr = DAG.getFrameIndex(
  6635. FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  6636. SDValue SpillSlot = Ptr;
  6637. // Ensure we generate all stores for each tuple part, whilst updating the
  6638. // pointer after each store correctly using vscale.
  6639. while (NumParts) {
  6640. Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
  6641. NumParts--;
  6642. if (NumParts > 0) {
  6643. SDValue BytesIncrement;
  6644. if (isScalable) {
  6645. BytesIncrement = DAG.getVScale(
  6646. DL, Ptr.getValueType(),
  6647. APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
  6648. } else {
  6649. BytesIncrement = DAG.getConstant(
  6650. APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
  6651. Ptr.getValueType());
  6652. }
  6653. SDNodeFlags Flags;
  6654. Flags.setNoUnsignedWrap(true);
  6655. MPI = MachinePointerInfo(MPI.getAddrSpace());
  6656. Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
  6657. BytesIncrement, Flags);
  6658. ExtraArgLocs++;
  6659. i++;
  6660. }
  6661. }
  6662. Arg = SpillSlot;
  6663. break;
  6664. }
  6665. if (VA.isRegLoc()) {
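// A first i64 argument marked 'returned' can be forwarded as the call result;
// record this so LowerCallResult reuses the value instead of copying it back
// out of the return register.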
  6666. if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
  6667. Outs[0].VT == MVT::i64) {
  6668. assert(VA.getLocVT() == MVT::i64 &&
  6669. "unexpected calling convention register assignment");
  6670. assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
  6671. "unexpected use of 'returned'");
  6672. IsThisReturn = true;
  6673. }
  6674. if (RegsUsed.count(VA.getLocReg())) {
  6675. // If this register has already been used then we're trying to pack
  6676. // parts of an [N x i32] into an X-register. The extension type will
  6677. // take care of putting the two halves in the right place but we have to
  6678. // combine them.
  6679. SDValue &Bits =
  6680. llvm::find_if(RegsToPass,
  6681. [=](const std::pair<unsigned, SDValue> &Elt) {
  6682. return Elt.first == VA.getLocReg();
  6683. })
  6684. ->second;
  6685. Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6686. // Call site info is used for the function's parameter entry-value
6687. // tracking. For now we only track the simple cases where a parameter
6688. // is transferred through a whole register.
  6689. llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
  6690. return ArgReg.Reg == VA.getLocReg();
  6691. });
  6692. } else {
6693. // Add an extra level of indirection for streaming mode changes by
6694. // using a pseudo copy node that the simple register coalescer cannot
6695. // rematerialise between a smstart/smstop and the call.
  6696. if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
  6697. Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
  6698. RegsToPass.emplace_back(VA.getLocReg(), Arg);
  6699. RegsUsed.insert(VA.getLocReg());
  6700. const TargetOptions &Options = DAG.getTarget().Options;
  6701. if (Options.EmitCallSiteInfo)
  6702. CSInfo.emplace_back(VA.getLocReg(), i);
  6703. }
  6704. } else {
  6705. assert(VA.isMemLoc());
  6706. SDValue DstAddr;
  6707. MachinePointerInfo DstInfo;
6708. // FIXME: This works on big-endian for composite byvals, which are the
6709. // common case. It should work for fundamental types as well.
  6710. uint32_t BEAlign = 0;
  6711. unsigned OpSize;
  6712. if (VA.getLocInfo() == CCValAssign::Indirect ||
  6713. VA.getLocInfo() == CCValAssign::Trunc)
  6714. OpSize = VA.getLocVT().getFixedSizeInBits();
  6715. else
  6716. OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
  6717. : VA.getValVT().getSizeInBits();
  6718. OpSize = (OpSize + 7) / 8;
  6719. if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
  6720. !Flags.isInConsecutiveRegs()) {
  6721. if (OpSize < 8)
  6722. BEAlign = 8 - OpSize;
  6723. }
  6724. unsigned LocMemOffset = VA.getLocMemOffset();
  6725. int32_t Offset = LocMemOffset + BEAlign;
  6726. SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
  6727. PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
  6728. if (IsTailCall) {
  6729. Offset = Offset + FPDiff;
  6730. int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
  6731. DstAddr = DAG.getFrameIndex(FI, PtrVT);
  6732. DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
  6733. // Make sure any stack arguments overlapping with where we're storing
  6734. // are loaded before this eventual operation. Otherwise they'll be
  6735. // clobbered.
  6736. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
  6737. } else {
  6738. SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
  6739. DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
  6740. DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
  6741. }
  6742. if (Outs[i].Flags.isByVal()) {
  6743. SDValue SizeNode =
  6744. DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
  6745. SDValue Cpy = DAG.getMemcpy(
  6746. Chain, DL, DstAddr, Arg, SizeNode,
  6747. Outs[i].Flags.getNonZeroByValAlign(),
  6748. /*isVol = */ false, /*AlwaysInline = */ false,
  6749. /*isTailCall = */ false, DstInfo, MachinePointerInfo());
  6750. MemOpChains.push_back(Cpy);
  6751. } else {
  6752. // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
  6753. // promoted to a legal register type i32, we should truncate Arg back to
  6754. // i1/i8/i16.
  6755. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
  6756. VA.getValVT() == MVT::i16)
  6757. Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
  6758. SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
  6759. MemOpChains.push_back(Store);
  6760. }
  6761. }
  6762. }
  6763. if (IsVarArg && Subtarget->isWindowsArm64EC()) {
  6764. // For vararg calls, the Arm64EC ABI requires values in x4 and x5
  6765. // describing the argument list. x4 contains the address of the
  6766. // first stack parameter. x5 contains the size in bytes of all parameters
  6767. // passed on the stack.
  6768. RegsToPass.emplace_back(AArch64::X4, StackPtr);
  6769. RegsToPass.emplace_back(AArch64::X5,
  6770. DAG.getConstant(NumBytes, DL, MVT::i64));
  6771. }
  6772. if (!MemOpChains.empty())
  6773. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
  6774. SDValue InFlag;
  6775. if (RequiresSMChange) {
  6776. SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain,
  6777. InFlag, PStateSM, true);
  6778. Chain = NewChain.getValue(0);
  6779. InFlag = NewChain.getValue(1);
  6780. }
  6781. // Build a sequence of copy-to-reg nodes chained together with token chain
  6782. // and flag operands which copy the outgoing args into the appropriate regs.
  6783. for (auto &RegToPass : RegsToPass) {
  6784. Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
  6785. RegToPass.second, InFlag);
  6786. InFlag = Chain.getValue(1);
  6787. }
  6788. // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  6789. // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  6790. // node so that legalize doesn't hack it.
  6791. if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
  6792. auto GV = G->getGlobal();
  6793. unsigned OpFlags =
  6794. Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
  6795. if (OpFlags & AArch64II::MO_GOT) {
  6796. Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
  6797. Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
  6798. } else {
  6799. const GlobalValue *GV = G->getGlobal();
  6800. Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
  6801. }
  6802. } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
  6803. if (getTargetMachine().getCodeModel() == CodeModel::Large &&
  6804. Subtarget->isTargetMachO()) {
  6805. const char *Sym = S->getSymbol();
  6806. Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
  6807. Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
  6808. } else {
  6809. const char *Sym = S->getSymbol();
  6810. Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
  6811. }
  6812. }
  6813. // We don't usually want to end the call-sequence here because we would tidy
  6814. // the frame up *after* the call, however in the ABI-changing tail-call case
  6815. // we've carefully laid out the parameters so that when sp is reset they'll be
  6816. // in the correct location.
  6817. if (IsTailCall && !IsSibCall) {
  6818. Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL);
  6819. InFlag = Chain.getValue(1);
  6820. }
  6821. std::vector<SDValue> Ops;
  6822. Ops.push_back(Chain);
  6823. Ops.push_back(Callee);
  6824. if (IsTailCall) {
  6825. // Each tail call may have to adjust the stack by a different amount, so
  6826. // this information must travel along with the operation for eventual
  6827. // consumption by emitEpilogue.
  6828. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  6829. }
  6830. // Add argument registers to the end of the list so that they are known live
  6831. // into the call.
  6832. for (auto &RegToPass : RegsToPass)
  6833. Ops.push_back(DAG.getRegister(RegToPass.first,
  6834. RegToPass.second.getValueType()));
  6835. // Add a register mask operand representing the call-preserved registers.
  6836. const uint32_t *Mask;
  6837. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  6838. if (IsThisReturn) {
  6839. // For 'this' returns, use the X0-preserving mask if applicable
  6840. Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
  6841. if (!Mask) {
  6842. IsThisReturn = false;
  6843. Mask = TRI->getCallPreservedMask(MF, CallConv);
  6844. }
  6845. } else
  6846. Mask = TRI->getCallPreservedMask(MF, CallConv);
  6847. if (Subtarget->hasCustomCallingConv())
  6848. TRI->UpdateCustomCallPreservedMask(MF, &Mask);
  6849. if (TRI->isAnyArgRegReserved(MF))
  6850. TRI->emitReservedArgRegCallError(MF);
  6851. assert(Mask && "Missing call preserved mask for calling convention");
  6852. Ops.push_back(DAG.getRegisterMask(Mask));
  6853. if (InFlag.getNode())
  6854. Ops.push_back(InFlag);
  6855. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6856. // If we're doing a tail call, use a TC_RETURN here rather than an
6857. // actual call instruction.
  6858. if (IsTailCall) {
  6859. MF.getFrameInfo().setHasTailCall();
  6860. SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
  6861. if (IsCFICall)
  6862. Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
  6863. DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
  6864. return Ret;
  6865. }
  6866. unsigned CallOpc = AArch64ISD::CALL;
  6867. // Calls with operand bundle "clang.arc.attachedcall" are special. They should
  6868. // be expanded to the call, directly followed by a special marker sequence and
  6869. // a call to an ObjC library function. Use CALL_RVMARKER to do that.
  6870. if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
  6871. assert(!IsTailCall &&
  6872. "tail calls cannot be marked with clang.arc.attachedcall");
  6873. CallOpc = AArch64ISD::CALL_RVMARKER;
  6874. // Add a target global address for the retainRV/claimRV runtime function
  6875. // just before the call target.
  6876. Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
  6877. auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
  6878. Ops.insert(Ops.begin() + 1, GA);
  6879. } else if (GuardWithBTI)
  6880. CallOpc = AArch64ISD::CALL_BTI;
  6881. // Returns a chain and a flag for retval copy to use.
  6882. Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
  6883. if (IsCFICall)
  6884. Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
  6885. DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  6886. InFlag = Chain.getValue(1);
  6887. DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
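// If the callee pops its own stack arguments (fastcc under
// GuaranteedTailCallOpt, or the tail/swifttail conventions), report the popped
// amount, kept 16-byte aligned.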
  6888. uint64_t CalleePopBytes =
  6889. DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
  6890. Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
  6891. InFlag = Chain.getValue(1);
  6892. // Handle result values, copying them out of physregs into vregs that we
  6893. // return.
  6894. SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
  6895. DL, DAG, InVals, IsThisReturn,
  6896. IsThisReturn ? OutVals[0] : SDValue());
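// When results were copied out, the last value of the returned node carries
// the glue, which the conditional smstart/smstop below must consume.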
  6897. if (!Ins.empty())
  6898. InFlag = Result.getValue(Result->getNumValues() - 1);
  6899. if (RequiresSMChange) {
  6900. assert(PStateSM && "Expected a PStateSM to be set");
  6901. Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
  6902. PStateSM, false);
  6903. }
  6904. if (RequiresLazySave) {
  6905. // Unconditionally resume ZA.
  6906. Result = DAG.getNode(
  6907. AArch64ISD::SMSTART, DL, MVT::Other, Result,
  6908. DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
  6909. DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  6910. // Conditionally restore the lazy save using a pseudo node.
  6911. unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
  6912. SDValue RegMask = DAG.getRegisterMask(
  6913. TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
  6914. SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
  6915. "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
  6916. SDValue TPIDR2_EL0 = DAG.getNode(
  6917. ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
  6918. DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
  6919. // Copy the address of the TPIDR2 block into X0 before 'calling' the
  6920. // RESTORE_ZA pseudo.
  6921. SDValue Glue;
  6922. SDValue TPIDR2Block = DAG.getFrameIndex(
  6923. FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  6924. Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
  6925. Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
  6926. {Result, TPIDR2_EL0,
  6927. DAG.getRegister(AArch64::X0, MVT::i64),
  6928. RestoreRoutine,
  6929. RegMask,
  6930. Result.getValue(1)});
  6931. // Finally reset the TPIDR2_EL0 register to 0.
  6932. Result = DAG.getNode(
  6933. ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
  6934. DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
  6935. DAG.getConstant(0, DL, MVT::i64));
  6936. }
  6937. if (RequiresSMChange || RequiresLazySave) {
  6938. for (unsigned I = 0; I < InVals.size(); ++I) {
  6939. // The smstart/smstop is chained as part of the call, but when the
  6940. // resulting chain is discarded (which happens when the call is not part
  6941. // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
  6942. // smstart/smstop is chained to the result value. We can do that by doing
  6943. // a vreg -> vreg copy.
  6944. Register Reg = MF.getRegInfo().createVirtualRegister(
  6945. getRegClassFor(InVals[I].getValueType().getSimpleVT()));
  6946. SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
  6947. InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
  6948. InVals[I].getValueType());
  6949. }
  6950. }
  6951. return Result;
  6952. }
  6953. bool AArch64TargetLowering::CanLowerReturn(
  6954. CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
  6955. const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  6956. CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  6957. SmallVector<CCValAssign, 16> RVLocs;
  6958. CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  6959. return CCInfo.CheckReturn(Outs, RetCC);
  6960. }
  6961. SDValue
  6962. AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
  6963. bool isVarArg,
  6964. const SmallVectorImpl<ISD::OutputArg> &Outs,
  6965. const SmallVectorImpl<SDValue> &OutVals,
  6966. const SDLoc &DL, SelectionDAG &DAG) const {
  6967. auto &MF = DAG.getMachineFunction();
  6968. auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  6969. CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  6970. SmallVector<CCValAssign, 16> RVLocs;
  6971. CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  6972. CCInfo.AnalyzeReturn(Outs, RetCC);
  6973. // Copy the result values into the output registers.
  6974. SDValue Flag;
  6975. SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
  6976. SmallSet<unsigned, 4> RegsUsed;
  6977. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
  6978. ++i, ++realRVLocIdx) {
  6979. CCValAssign &VA = RVLocs[i];
  6980. assert(VA.isRegLoc() && "Can only return in registers!");
  6981. SDValue Arg = OutVals[realRVLocIdx];
  6982. switch (VA.getLocInfo()) {
  6983. default:
  6984. llvm_unreachable("Unknown loc info!");
  6985. case CCValAssign::Full:
  6986. if (Outs[i].ArgVT == MVT::i1) {
  6987. // AAPCS requires i1 to be zero-extended to i8 by the producer of the
  6988. // value. This is strictly redundant on Darwin (which uses "zeroext
  6989. // i1"), but will be optimised out before ISel.
  6990. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
  6991. Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
  6992. }
  6993. break;
  6994. case CCValAssign::BCvt:
  6995. Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
  6996. break;
  6997. case CCValAssign::AExt:
  6998. case CCValAssign::ZExt:
  6999. Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
  7000. break;
  7001. case CCValAssign::AExtUpper:
  7002. assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
  7003. Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
  7004. Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
  7005. DAG.getConstant(32, DL, VA.getLocVT()));
  7006. break;
  7007. }
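// As in LowerCall, two halves packed into the same return register (e.g. an
// [N x i32]) are merged with an OR.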
  7008. if (RegsUsed.count(VA.getLocReg())) {
  7009. SDValue &Bits =
  7010. llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
  7011. return Elt.first == VA.getLocReg();
  7012. })->second;
  7013. Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
  7014. } else {
  7015. RetVals.emplace_back(VA.getLocReg(), Arg);
  7016. RegsUsed.insert(VA.getLocReg());
  7017. }
  7018. }
  7019. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  7020. // Emit SMSTOP before returning from a locally streaming function
  7021. SMEAttrs FuncAttrs(MF.getFunction());
  7022. if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
  7023. Chain = DAG.getNode(
  7024. AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
  7025. DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
  7026. DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
  7027. DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
  7028. Flag = Chain.getValue(1);
  7029. }
  7030. SmallVector<SDValue, 4> RetOps(1, Chain);
  7031. for (auto &RetVal : RetVals) {
  7032. Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
  7033. Flag = Chain.getValue(1);
  7034. RetOps.push_back(
  7035. DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  7036. }
  7037. // Windows AArch64 ABIs require that for returning structs by value we copy
  7038. // the sret argument into X0 for the return.
  7039. // We saved the argument into a virtual register in the entry block,
  7040. // so now we copy the value out and into X0.
  7041. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
  7042. SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
  7043. getPointerTy(MF.getDataLayout()));
  7044. unsigned RetValReg = AArch64::X0;
  7045. Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
  7046. Flag = Chain.getValue(1);
  7047. RetOps.push_back(
  7048. DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  7049. }
  7050. const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  7051. if (I) {
  7052. for (; *I; ++I) {
  7053. if (AArch64::GPR64RegClass.contains(*I))
  7054. RetOps.push_back(DAG.getRegister(*I, MVT::i64));
  7055. else if (AArch64::FPR64RegClass.contains(*I))
  7056. RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
  7057. else
  7058. llvm_unreachable("Unexpected register class in CSRsViaCopy!");
  7059. }
  7060. }
  7061. RetOps[0] = Chain; // Update chain.
  7062. // Add the flag if we have it.
  7063. if (Flag.getNode())
  7064. RetOps.push_back(Flag);
  7065. return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
  7066. }
  7067. //===----------------------------------------------------------------------===//
  7068. // Other Lowering Code
  7069. //===----------------------------------------------------------------------===//
  7070. SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
  7071. SelectionDAG &DAG,
  7072. unsigned Flag) const {
  7073. return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
  7074. N->getOffset(), Flag);
  7075. }
  7076. SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
  7077. SelectionDAG &DAG,
  7078. unsigned Flag) const {
  7079. return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
  7080. }
  7081. SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
  7082. SelectionDAG &DAG,
  7083. unsigned Flag) const {
  7084. return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
  7085. N->getOffset(), Flag);
  7086. }
  7087. SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
  7088. SelectionDAG &DAG,
  7089. unsigned Flag) const {
  7090. return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
  7091. }
  7092. // (loadGOT sym)
  7093. template <class NodeTy>
  7094. SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
  7095. unsigned Flags) const {
  7096. LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
  7097. SDLoc DL(N);
  7098. EVT Ty = getPointerTy(DAG.getDataLayout());
  7099. SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
  7100. // FIXME: Once remat is capable of dealing with instructions with register
  7101. // operands, expand this into two nodes instead of using a wrapper node.
  7102. return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
  7103. }
  7104. // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
  7105. template <class NodeTy>
  7106. SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
  7107. unsigned Flags) const {
  7108. LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  7109. SDLoc DL(N);
  7110. EVT Ty = getPointerTy(DAG.getDataLayout());
  7111. const unsigned char MO_NC = AArch64II::MO_NC;
  7112. return DAG.getNode(
  7113. AArch64ISD::WrapperLarge, DL, Ty,
  7114. getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
  7115. getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
  7116. getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
  7117. getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
  7118. }
  7119. // (addlow (adrp %hi(sym)) %lo(sym))
  7120. template <class NodeTy>
  7121. SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
  7122. unsigned Flags) const {
  7123. LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  7124. SDLoc DL(N);
  7125. EVT Ty = getPointerTy(DAG.getDataLayout());
  7126. SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  7127. SDValue Lo = getTargetNode(N, Ty, DAG,
  7128. AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
  7129. SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  7130. return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
  7131. }
  7132. // (adr sym)
  7133. template <class NodeTy>
  7134. SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
  7135. unsigned Flags) const {
  7136. LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
  7137. SDLoc DL(N);
  7138. EVT Ty = getPointerTy(DAG.getDataLayout());
  7139. SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
  7140. return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
  7141. }
  7142. SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
  7143. SelectionDAG &DAG) const {
  7144. GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  7145. const GlobalValue *GV = GN->getGlobal();
  7146. unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  7147. if (OpFlags != AArch64II::MO_NO_FLAG)
  7148. assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
  7149. "unexpected offset in global node");
  7150. // This also catches the large code model case for Darwin, and tiny code
7151. // model with GOT relocations.
  7152. if ((OpFlags & AArch64II::MO_GOT) != 0) {
  7153. return getGOT(GN, DAG, OpFlags);
  7154. }
  7155. SDValue Result;
  7156. if (getTargetMachine().getCodeModel() == CodeModel::Large) {
  7157. Result = getAddrLarge(GN, DAG, OpFlags);
  7158. } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
  7159. Result = getAddrTiny(GN, DAG, OpFlags);
  7160. } else {
  7161. Result = getAddr(GN, DAG, OpFlags);
  7162. }
  7163. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  7164. SDLoc DL(GN);
  7165. if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
  7166. AArch64II::MO_COFFSTUB))
  7167. Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  7168. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  7169. return Result;
  7170. }
  7171. /// Convert a TLS address reference into the correct sequence of loads
  7172. /// and calls to compute the variable's address (for Darwin, currently) and
  7173. /// return an SDValue containing the final node.
  7174. /// Darwin only has one TLS scheme which must be capable of dealing with the
  7175. /// fully general situation, in the worst case. This means:
  7176. /// + "extern __thread" declaration.
  7177. /// + Defined in a possibly unknown dynamic library.
  7178. ///
  7179. /// The general system is that each __thread variable has a [3 x i64] descriptor
  7180. /// which contains information used by the runtime to calculate the address. The
  7181. /// only part of this the compiler needs to know about is the first xword, which
  7182. /// contains a function pointer that must be called with the address of the
  7183. /// entire descriptor in "x0".
  7184. ///
  7185. /// Since this descriptor may be in a different unit, in general even the
  7186. /// descriptor must be accessed via an indirect load. The "ideal" code sequence
  7187. /// is:
  7188. /// adrp x0, _var@TLVPPAGE
  7189. /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
  7190. /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
  7191. /// ; the function pointer
  7192. /// blr x1 ; Uses descriptor address in x0
  7193. /// ; Address of _var is now in x0.
  7194. ///
  7195. /// If the address of _var's descriptor *is* known to the linker, then it can
  7196. /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
  7197. /// a slight efficiency gain.
  7198. SDValue
  7199. AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
  7200. SelectionDAG &DAG) const {
  7201. assert(Subtarget->isTargetDarwin() &&
  7202. "This function expects a Darwin target");
  7203. SDLoc DL(Op);
  7204. MVT PtrVT = getPointerTy(DAG.getDataLayout());
  7205. MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  7206. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  7207. SDValue TLVPAddr =
  7208. DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  7209. SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
  7210. // The first entry in the descriptor is a function pointer that we must call
  7211. // to obtain the address of the variable.
  7212. SDValue Chain = DAG.getEntryNode();
  7213. SDValue FuncTLVGet = DAG.getLoad(
  7214. PtrMemVT, DL, Chain, DescAddr,
  7215. MachinePointerInfo::getGOT(DAG.getMachineFunction()),
  7216. Align(PtrMemVT.getSizeInBits() / 8),
  7217. MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
  7218. Chain = FuncTLVGet.getValue(1);
  7219. // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  7220. FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
  7221. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  7222. MFI.setAdjustsStack(true);
  7223. // TLS calls preserve all registers except those that absolutely must be
  7224. // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  7225. // silly).
  7226. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  7227. const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  7228. if (Subtarget->hasCustomCallingConv())
  7229. TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
  7230. // Finally, we can make the call. This is just a degenerate version of a
  7231. // normal AArch64 call node: x0 takes the address of the descriptor, and
  7232. // returns the address of the variable in this thread.
  7233. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  7234. Chain =
  7235. DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
  7236. Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
  7237. DAG.getRegisterMask(Mask), Chain.getValue(1));
  7238. return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
  7239. }
  7240. /// Convert a thread-local variable reference into a sequence of instructions to
  7241. /// compute the variable's address for the local exec TLS model of ELF targets.
  7242. /// The sequence depends on the maximum TLS area size.
  7243. SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
  7244. SDValue ThreadBase,
  7245. const SDLoc &DL,
  7246. SelectionDAG &DAG) const {
  7247. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  7248. SDValue TPOff, Addr;
  7249. switch (DAG.getTarget().Options.TLSSize) {
  7250. default:
  7251. llvm_unreachable("Unexpected TLS size");
  7252. case 12: {
  7253. // mrs x0, TPIDR_EL0
  7254. // add x0, x0, :tprel_lo12:a
  7255. SDValue Var = DAG.getTargetGlobalAddress(
  7256. GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
  7257. return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
  7258. Var,
  7259. DAG.getTargetConstant(0, DL, MVT::i32)),
  7260. 0);
  7261. }
  7262. case 24: {
  7263. // mrs x0, TPIDR_EL0
  7264. // add x0, x0, :tprel_hi12:a
  7265. // add x0, x0, :tprel_lo12_nc:a
  7266. SDValue HiVar = DAG.getTargetGlobalAddress(
  7267. GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  7268. SDValue LoVar = DAG.getTargetGlobalAddress(
  7269. GV, DL, PtrVT, 0,
  7270. AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  7271. Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
  7272. HiVar,
  7273. DAG.getTargetConstant(0, DL, MVT::i32)),
  7274. 0);
  7275. return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
  7276. LoVar,
  7277. DAG.getTargetConstant(0, DL, MVT::i32)),
  7278. 0);
  7279. }
  7280. case 32: {
  7281. // mrs x1, TPIDR_EL0
  7282. // movz x0, #:tprel_g1:a
  7283. // movk x0, #:tprel_g0_nc:a
  7284. // add x0, x1, x0
  7285. SDValue HiVar = DAG.getTargetGlobalAddress(
  7286. GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
  7287. SDValue LoVar = DAG.getTargetGlobalAddress(
  7288. GV, DL, PtrVT, 0,
  7289. AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
  7290. TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
  7291. DAG.getTargetConstant(16, DL, MVT::i32)),
  7292. 0);
  7293. TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
  7294. DAG.getTargetConstant(0, DL, MVT::i32)),
  7295. 0);
  7296. return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  7297. }
  7298. case 48: {
  7299. // mrs x1, TPIDR_EL0
  7300. // movz x0, #:tprel_g2:a
  7301. // movk x0, #:tprel_g1_nc:a
  7302. // movk x0, #:tprel_g0_nc:a
  7303. // add x0, x1, x0
  7304. SDValue HiVar = DAG.getTargetGlobalAddress(
  7305. GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
  7306. SDValue MiVar = DAG.getTargetGlobalAddress(
  7307. GV, DL, PtrVT, 0,
  7308. AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
  7309. SDValue LoVar = DAG.getTargetGlobalAddress(
  7310. GV, DL, PtrVT, 0,
  7311. AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
  7312. TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
  7313. DAG.getTargetConstant(32, DL, MVT::i32)),
  7314. 0);
  7315. TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
  7316. DAG.getTargetConstant(16, DL, MVT::i32)),
  7317. 0);
  7318. TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
  7319. DAG.getTargetConstant(0, DL, MVT::i32)),
  7320. 0);
  7321. return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  7322. }
  7323. }
  7324. }
  7325. /// When accessing thread-local variables under either the general-dynamic or
  7326. /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
  7327. /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
  7328. /// is a function pointer to carry out the resolution.
  7329. ///
  7330. /// The sequence is:
  7331. /// adrp x0, :tlsdesc:var
  7332. /// ldr x1, [x0, #:tlsdesc_lo12:var]
  7333. /// add x0, x0, #:tlsdesc_lo12:var
  7334. /// .tlsdesccall var
  7335. /// blr x1
  7336. /// (TPIDR_EL0 offset now in x0)
  7337. ///
  7338. /// The above sequence must be produced unscheduled, to enable the linker to
  7339. /// optimize/relax this sequence.
  7340. /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
  7341. /// above sequence, and expanded really late in the compilation flow, to ensure
  7342. /// the sequence is produced as per above.
  7343. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
  7344. const SDLoc &DL,
  7345. SelectionDAG &DAG) const {
  7346. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  7347. SDValue Chain = DAG.getEntryNode();
  7348. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  7349. Chain =
  7350. DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
  7351. SDValue Glue = Chain.getValue(1);
  7352. return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
  7353. }
  7354. SDValue
  7355. AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
  7356. SelectionDAG &DAG) const {
  7357. assert(Subtarget->isTargetELF() && "This function expects an ELF target");
  7358. const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  7359. TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
  7360. if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
  7361. if (Model == TLSModel::LocalDynamic)
  7362. Model = TLSModel::GeneralDynamic;
  7363. }
  7364. if (getTargetMachine().getCodeModel() == CodeModel::Large &&
  7365. Model != TLSModel::LocalExec)
  7366. report_fatal_error("ELF TLS only supported in small memory model or "
  7367. "in local exec TLS model");
  7368. // Different choices can be made for the maximum size of the TLS area for a
  7369. // module. For the small address model, the default TLS size is 16MiB and the
  7370. // maximum TLS size is 4GiB.
  7371. // FIXME: add tiny and large code model support for TLS access models other
  7372. // than local exec. We currently generate the same code as small for tiny,
  7373. // which may be larger than needed.
  7374. SDValue TPOff;
  7375. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  7376. SDLoc DL(Op);
  7377. const GlobalValue *GV = GA->getGlobal();
  7378. SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
  7379. if (Model == TLSModel::LocalExec) {
  7380. return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  7381. } else if (Model == TLSModel::InitialExec) {
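// Initial-exec: the variable's offset from the thread pointer is loaded from
// the GOT. A rough sketch of the resulting code (illustrative; exact
// registers and scheduling may differ):
//   adrp x0, :gottprel:var
//   ldr  x0, [x0, #:gottprel_lo12:var]
//   mrs  x1, TPIDR_EL0
//   add  x0, x1, x0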
  7382. TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  7383. TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  7384. } else if (Model == TLSModel::LocalDynamic) {
  7385. // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
  7386. // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
  7387. // the beginning of the module's TLS region, followed by a DTPREL offset
  7388. // calculation.
  7389. // These accesses will need deduplicating if there's more than one.
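// A rough sketch of the resulting code (illustrative; exact registers and
// scheduling may differ):
//   adrp x0, :tlsdesc:_TLS_MODULE_BASE_
//   ldr  x1, [x0, #:tlsdesc_lo12:_TLS_MODULE_BASE_]
//   add  x0, x0, #:tlsdesc_lo12:_TLS_MODULE_BASE_
//   .tlsdesccall _TLS_MODULE_BASE_
//   blr  x1
//   add  x0, x0, :dtprel_hi12:var
//   add  x0, x0, :dtprel_lo12_nc:var
//   mrs  x8, TPIDR_EL0
//   add  x0, x8, x0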
  7390. AArch64FunctionInfo *MFI =
  7391. DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  7392. MFI->incNumLocalDynamicTLSAccesses();
  7393. // The call needs a relocation too for linker relaxation. It doesn't make
  7394. // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
  7395. // the address.
  7396. SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
  7397. AArch64II::MO_TLS);
  7398. // Now we can calculate the offset from TPIDR_EL0 to this module's
  7399. // thread-local area.
  7400. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  7401. // Now use :dtprel_whatever: operations to calculate this variable's offset
  7402. // in its thread-storage area.
  7403. SDValue HiVar = DAG.getTargetGlobalAddress(
  7404. GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  7405. SDValue LoVar = DAG.getTargetGlobalAddress(
  7406. GV, DL, MVT::i64, 0,
  7407. AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  7408. TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
  7409. DAG.getTargetConstant(0, DL, MVT::i32)),
  7410. 0);
  7411. TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
  7412. DAG.getTargetConstant(0, DL, MVT::i32)),
  7413. 0);
  7414. } else if (Model == TLSModel::GeneralDynamic) {
  7415. // The call needs a relocation too for linker relaxation. It doesn't make
  7416. // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
  7417. // the address.
  7418. SDValue SymAddr =
  7419. DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  7420. // Finally we can make a call to calculate the offset from tpidr_el0.
  7421. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  7422. } else
  7423. llvm_unreachable("Unsupported ELF TLS access model");
  7424. return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  7425. }
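/// Lower a Windows TLS access. The per-module TLS block for the current thread
/// is found by walking from the TEB (x18): ThreadLocalStoragePointer lives at
/// offset 0x58, _tls_index selects this module's slot in that array, and the
/// variable is then addressed by its offset from the .tls section base. A
/// rough sketch of the resulting code (illustrative; exact registers and
/// relocation spellings may vary):
///   ldr  x8, [x18, #0x58]          ; TLS array pointer from the TEB
///   adrp x9, _tls_index
///   ldr  w9, [x9, :lo12:_tls_index]
///   ldr  x8, [x8, x9, lsl #3]      ; this module's TLS block
///   add  x8, x8, #:secrel_hi12:var
///   add  x0, x8, #:secrel_lo12:var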
  7426. SDValue
  7427. AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
  7428. SelectionDAG &DAG) const {
  7429. assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
  7430. SDValue Chain = DAG.getEntryNode();
  7431. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  7432. SDLoc DL(Op);
  7433. SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
  7434. // Load the ThreadLocalStoragePointer from the TEB
  7435. // A pointer to the TLS array is located at offset 0x58 from the TEB.
  7436. SDValue TLSArray =
  7437. DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  7438. TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  7439. Chain = TLSArray.getValue(1);
  7440. // Load the TLS index from the C runtime;
  7441. // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  7442. // This also does the same as LOADgot, but using a generic i32 load,
  7443. // while LOADgot only loads i64.
  7444. SDValue TLSIndexHi =
  7445. DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  7446. SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
  7447. "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  7448. SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  7449. SDValue TLSIndex =
  7450. DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  7451. TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  7452. Chain = TLSIndex.getValue(1);
// The pointer to this module's TLS data area for the current thread is found
// at offset (TLS index * 8) into the TLS array.
  7455. TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  7456. SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
  7457. DAG.getConstant(3, DL, PtrVT));
  7458. SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
  7459. DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
  7460. MachinePointerInfo());
  7461. Chain = TLS.getValue(1);
  7462. const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  7463. const GlobalValue *GV = GA->getGlobal();
  7464. SDValue TGAHi = DAG.getTargetGlobalAddress(
  7465. GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  7466. SDValue TGALo = DAG.getTargetGlobalAddress(
  7467. GV, DL, PtrVT, 0,
  7468. AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  7469. // Add the offset from the start of the .tls section (section base).
  7470. SDValue Addr =
  7471. SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
  7472. DAG.getTargetConstant(0, DL, MVT::i32)),
  7473. 0);
  7474. Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  7475. return Addr;
  7476. }
  7477. SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
  7478. SelectionDAG &DAG) const {
  7479. const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  7480. if (DAG.getTarget().useEmulatedTLS())
  7481. return LowerToTLSEmulatedModel(GA, DAG);
  7482. if (Subtarget->isTargetDarwin())
  7483. return LowerDarwinGlobalTLSAddress(Op, DAG);
  7484. if (Subtarget->isTargetELF())
  7485. return LowerELFGlobalTLSAddress(Op, DAG);
  7486. if (Subtarget->isTargetWindows())
  7487. return LowerWindowsGlobalTLSAddress(Op, DAG);
  7488. llvm_unreachable("Unexpected platform trying to use TLS");
  7489. }
  7490. // Looks through \param Val to determine the bit that can be used to
  7491. // check the sign of the value. It returns the unextended value and
  7492. // the sign bit position.
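// For example (illustrative): (sign_extend_inreg i32 %x, i8) yields {%x, 7},
// (sign_extend i64 (i32 %x)) yields {%x, 31}, and a plain i64 value yields
// {Val, 63}.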
  7493. std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
  7494. if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
  7495. return {Val.getOperand(0),
  7496. cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
  7497. 1};
  7498. if (Val.getOpcode() == ISD::SIGN_EXTEND)
  7499. return {Val.getOperand(0),
  7500. Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
  7501. return {Val, Val.getValueSizeInBits() - 1};
  7502. }
  7503. SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  7504. SDValue Chain = Op.getOperand(0);
  7505. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  7506. SDValue LHS = Op.getOperand(2);
  7507. SDValue RHS = Op.getOperand(3);
  7508. SDValue Dest = Op.getOperand(4);
  7509. SDLoc dl(Op);
  7510. MachineFunction &MF = DAG.getMachineFunction();
  7511. // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  7512. // will not be produced, as they are conditional branch instructions that do
  7513. // not set flags.
  7514. bool ProduceNonFlagSettingCondBr =
  7515. !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
  7516. // Handle f128 first, since lowering it will result in comparing the return
  7517. // value of a libcall against zero, which is just what the rest of LowerBR_CC
  7518. // is expecting to deal with.
  7519. if (LHS.getValueType() == MVT::f128) {
  7520. softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
  7521. // If softenSetCCOperands returned a scalar, we need to compare the result
  7522. // against zero to select between true and false values.
  7523. if (!RHS.getNode()) {
  7524. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  7525. CC = ISD::SETNE;
  7526. }
  7527. }
  7528. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  7529. // instruction.
  7530. if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
  7531. (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  7532. // Only lower legal XALUO ops.
  7533. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
  7534. return SDValue();
  7535. // The actual operation with overflow check.
  7536. AArch64CC::CondCode OFCC;
  7537. SDValue Value, Overflow;
  7538. std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
  7539. if (CC == ISD::SETNE)
  7540. OFCC = getInvertedCondCode(OFCC);
  7541. SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
  7542. return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
  7543. Overflow);
  7544. }
  7545. if (LHS.getValueType().isInteger()) {
  7546. assert((LHS.getValueType() == RHS.getValueType()) &&
  7547. (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
  7548. // If the RHS of the comparison is zero, we can potentially fold this
  7549. // to a specialized branch.
  7550. const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
  7551. if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
  7552. if (CC == ISD::SETEQ) {
  7553. // See if we can use a TBZ to fold in an AND as well.
  7554. // TBZ has a smaller branch displacement than CBZ. If the offset is
  7555. // out of bounds, a late MI-layer pass rewrites branches.
  7556. // 403.gcc is an example that hits this case.
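// For example (illustrative), (br_cc seteq (and x, 4), 0, dest) folds to
// "tbz x, #2, dest".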
  7557. if (LHS.getOpcode() == ISD::AND &&
  7558. isa<ConstantSDNode>(LHS.getOperand(1)) &&
  7559. isPowerOf2_64(LHS.getConstantOperandVal(1))) {
  7560. SDValue Test = LHS.getOperand(0);
  7561. uint64_t Mask = LHS.getConstantOperandVal(1);
  7562. return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
  7563. DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
  7564. Dest);
  7565. }
  7566. return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
  7567. } else if (CC == ISD::SETNE) {
  7568. // See if we can use a TBZ to fold in an AND as well.
  7569. // TBZ has a smaller branch displacement than CBZ. If the offset is
  7570. // out of bounds, a late MI-layer pass rewrites branches.
  7571. // 403.gcc is an example that hits this case.
  7572. if (LHS.getOpcode() == ISD::AND &&
  7573. isa<ConstantSDNode>(LHS.getOperand(1)) &&
  7574. isPowerOf2_64(LHS.getConstantOperandVal(1))) {
  7575. SDValue Test = LHS.getOperand(0);
  7576. uint64_t Mask = LHS.getConstantOperandVal(1);
  7577. return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
  7578. DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
  7579. Dest);
  7580. }
  7581. return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
  7582. } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
  7583. // Don't combine AND since emitComparison converts the AND to an ANDS
  7584. // (a.k.a. TST) and the test in the test bit and branch instruction
  7585. // becomes redundant. This would also increase register pressure.
  7586. uint64_t SignBitPos;
  7587. std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
  7588. return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
  7589. DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
  7590. }
  7591. }
  7592. if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
  7593. LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
  7594. // Don't combine AND since emitComparison converts the AND to an ANDS
  7595. // (a.k.a. TST) and the test in the test bit and branch instruction
  7596. // becomes redundant. This would also increase register pressure.
  7597. uint64_t SignBitPos;
  7598. std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
  7599. return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
  7600. DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
  7601. }
  7602. SDValue CCVal;
  7603. SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
  7604. return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
  7605. Cmp);
  7606. }
  7607. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
  7608. LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
  7609. // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  7610. // clean. Some of them require two branches to implement.
  7611. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  7612. AArch64CC::CondCode CC1, CC2;
  7613. changeFPCCToAArch64CC(CC, CC1, CC2);
  7614. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  7615. SDValue BR1 =
  7616. DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  7617. if (CC2 != AArch64CC::AL) {
  7618. SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
  7619. return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
  7620. Cmp);
  7621. }
  7622. return BR1;
  7623. }
  7624. SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
  7625. SelectionDAG &DAG) const {
  7626. if (!Subtarget->hasNEON())
  7627. return SDValue();
  7628. EVT VT = Op.getValueType();
  7629. EVT IntVT = VT.changeTypeToInteger();
  7630. SDLoc DL(Op);
  7631. SDValue In1 = Op.getOperand(0);
  7632. SDValue In2 = Op.getOperand(1);
  7633. EVT SrcVT = In2.getValueType();
  7634. if (!SrcVT.bitsEq(VT))
  7635. In2 = DAG.getFPExtendOrRound(In2, DL, VT);
  7636. if (VT.isScalableVector())
  7637. IntVT =
  7638. getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
  7639. if (VT.isFixedLengthVector() &&
  7640. useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE())) {
  7641. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  7642. In1 = convertToScalableVector(DAG, ContainerVT, In1);
  7643. In2 = convertToScalableVector(DAG, ContainerVT, In2);
  7644. SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
  7645. return convertFromScalableVector(DAG, VT, Res);
  7646. }
  7647. auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
  7648. if (VT.isScalableVector())
  7649. return getSVESafeBitCast(VT, Op, DAG);
  7650. return DAG.getBitcast(VT, Op);
  7651. };
  7652. SDValue VecVal1, VecVal2;
  7653. EVT VecVT;
  7654. auto SetVecVal = [&](int Idx = -1) {
  7655. if (!VT.isVector()) {
  7656. VecVal1 =
  7657. DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
  7658. VecVal2 =
  7659. DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
  7660. } else {
  7661. VecVal1 = BitCast(VecVT, In1, DAG);
  7662. VecVal2 = BitCast(VecVT, In2, DAG);
  7663. }
  7664. };
  7665. if (VT.isVector()) {
  7666. VecVT = IntVT;
  7667. SetVecVal();
  7668. } else if (VT == MVT::f64) {
  7669. VecVT = MVT::v2i64;
  7670. SetVecVal(AArch64::dsub);
  7671. } else if (VT == MVT::f32) {
  7672. VecVT = MVT::v4i32;
  7673. SetVecVal(AArch64::ssub);
  7674. } else if (VT == MVT::f16) {
  7675. VecVT = MVT::v8i16;
  7676. SetVecVal(AArch64::hsub);
  7677. } else {
  7678. llvm_unreachable("Invalid type for copysign!");
  7679. }
  7680. unsigned BitWidth = In1.getScalarValueSizeInBits();
  7681. SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
  7682. // We want to materialize a mask with every bit but the high bit set, but the
  7683. // AdvSIMD immediate moves cannot materialize that in a single instruction for
  7684. // 64-bit elements. Instead, materialize all bits set and then negate that.
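// Illustratively: "movi v1.2d, #0xffffffffffffffff ; fneg v1.2d, v1.2d"
// leaves 0x7fffffffffffffff in each 64-bit lane.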
  7685. if (VT == MVT::f64 || VT == MVT::v2f64) {
  7686. SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
  7687. SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
  7688. SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
  7689. SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
  7690. }
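// BSP selects the sign bit from VecVal2 (In2) and every remaining bit from
// VecVal1 (In1), which is exactly copysign.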
  7691. SDValue BSP =
  7692. DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
  7693. if (VT == MVT::f16)
  7694. return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
  7695. if (VT == MVT::f32)
  7696. return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
  7697. if (VT == MVT::f64)
  7698. return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
  7699. return BitCast(VT, BSP, DAG);
  7700. }
  7701. SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
  7702. SelectionDAG &DAG) const {
  7703. if (DAG.getMachineFunction().getFunction().hasFnAttribute(
  7704. Attribute::NoImplicitFloat))
  7705. return SDValue();
  7706. if (!Subtarget->hasNEON())
  7707. return SDValue();
  7708. bool IsParity = Op.getOpcode() == ISD::PARITY;
  7709. SDValue Val = Op.getOperand(0);
  7710. SDLoc DL(Op);
  7711. EVT VT = Op.getValueType();
// For i32, computing parity with a chain of EORs is more efficient than the
// AdvSIMD CNT sequence below.
  7714. if (VT == MVT::i32 && IsParity)
  7715. return SDValue();
  7716. // If there is no CNT instruction available, GPR popcount can
  7717. // be more efficiently lowered to the following sequence that uses
  7718. // AdvSIMD registers/instructions as long as the copies to/from
  7719. // the AdvSIMD registers are cheap.
  7720. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
  7721. // CNT V0.8B, V0.8B // 8xbyte pop-counts
  7722. // ADDV B0, V0.8B // sum 8xbyte pop-counts
  7723. // UMOV X0, V0.B[0] // copy byte result back to integer reg
  7724. if (VT == MVT::i32 || VT == MVT::i64) {
  7725. if (VT == MVT::i32)
  7726. Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
  7727. Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
  7728. SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
  7729. SDValue UaddLV = DAG.getNode(
  7730. ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
  7731. DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
  7732. if (IsParity)
  7733. UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
  7734. DAG.getConstant(1, DL, MVT::i32));
  7735. if (VT == MVT::i64)
  7736. UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
  7737. return UaddLV;
  7738. } else if (VT == MVT::i128) {
  7739. Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
  7740. SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
  7741. SDValue UaddLV = DAG.getNode(
  7742. ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
  7743. DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
  7744. if (IsParity)
  7745. UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
  7746. DAG.getConstant(1, DL, MVT::i32));
  7747. return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
  7748. }
  7749. assert(!IsParity && "ISD::PARITY of vector types not supported");
  7750. if (VT.isScalableVector() ||
  7751. useSVEForFixedLengthVectorVT(VT,
  7752. Subtarget->forceStreamingCompatibleSVE()))
  7753. return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
  7754. assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
  7755. VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
  7756. "Unexpected type for custom ctpop lowering");
  7757. EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  7758. Val = DAG.getBitcast(VT8Bit, Val);
  7759. Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
  7760. // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
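// For example (illustrative), a v4i32 ctpop becomes:
//   cnt    v0.16b, v0.16b
//   uaddlp v0.8h,  v0.16b
//   uaddlp v0.4s,  v0.8h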
  7761. unsigned EltSize = 8;
  7762. unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  7763. while (EltSize != VT.getScalarSizeInBits()) {
  7764. EltSize *= 2;
  7765. NumElts /= 2;
  7766. MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
  7767. Val = DAG.getNode(
  7768. ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
  7769. DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
  7770. }
  7771. return Val;
  7772. }
  7773. SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  7774. EVT VT = Op.getValueType();
  7775. assert(VT.isScalableVector() ||
  7776. useSVEForFixedLengthVectorVT(
  7777. VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
  7778. SDLoc DL(Op);
  7779. SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
  7780. return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
  7781. }
  7782. SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
  7783. SelectionDAG &DAG) const {
  7784. EVT VT = Op.getValueType();
  7785. SDLoc DL(Op);
  7786. unsigned Opcode = Op.getOpcode();
  7787. ISD::CondCode CC;
  7788. switch (Opcode) {
  7789. default:
  7790. llvm_unreachable("Wrong instruction");
  7791. case ISD::SMAX:
  7792. CC = ISD::SETGT;
  7793. break;
  7794. case ISD::SMIN:
  7795. CC = ISD::SETLT;
  7796. break;
  7797. case ISD::UMAX:
  7798. CC = ISD::SETUGT;
  7799. break;
  7800. case ISD::UMIN:
  7801. CC = ISD::SETULT;
  7802. break;
  7803. }
  7804. if (VT.isScalableVector() ||
  7805. useSVEForFixedLengthVectorVT(
  7806. VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
  7807. switch (Opcode) {
  7808. default:
  7809. llvm_unreachable("Wrong instruction");
  7810. case ISD::SMAX:
  7811. return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
  7812. case ISD::SMIN:
  7813. return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
  7814. case ISD::UMAX:
  7815. return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
  7816. case ISD::UMIN:
  7817. return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
  7818. }
  7819. }
  7820. SDValue Op0 = Op.getOperand(0);
  7821. SDValue Op1 = Op.getOperand(1);
  7822. SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
  7823. return DAG.getSelect(DL, VT, Cond, Op0, Op1);
  7824. }
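/// Lower BITREVERSE. Scalable vectors (and fixed-length vectors routed through
/// SVE) use the predicated BITREVERSE node; NEON vectors byte-reverse each
/// element and then bit-reverse every byte, e.g. for v4i32 (illustrative):
///   rev32 v0.16b, v0.16b
///   rbit  v0.16b, v0.16b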
  7825. SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
  7826. SelectionDAG &DAG) const {
  7827. EVT VT = Op.getValueType();
  7828. if (VT.isScalableVector() ||
  7829. useSVEForFixedLengthVectorVT(
  7830. VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
  7831. return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
  7832. SDLoc DL(Op);
  7833. SDValue REVB;
  7834. MVT VST;
  7835. switch (VT.getSimpleVT().SimpleTy) {
  7836. default:
  7837. llvm_unreachable("Invalid type for bitreverse!");
  7838. case MVT::v2i32: {
  7839. VST = MVT::v8i8;
  7840. REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
  7841. break;
  7842. }
  7843. case MVT::v4i32: {
  7844. VST = MVT::v16i8;
  7845. REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
  7846. break;
  7847. }
  7848. case MVT::v1i64: {
  7849. VST = MVT::v8i8;
  7850. REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
  7851. break;
  7852. }
  7853. case MVT::v2i64: {
  7854. VST = MVT::v16i8;
  7855. REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
  7856. break;
  7857. }
  7858. }
  7859. return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
  7860. DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
  7861. }
// Check whether \p N forms a continuous comparison sequence, i.e. a chain of
// ORs whose leaves are XOR compares.
  7863. static bool
  7864. isOrXorChain(SDValue N, unsigned &Num,
  7865. SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
  7866. if (Num == MaxXors)
  7867. return false;
  7868. // Skip the one-use zext
  7869. if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
  7870. N = N->getOperand(0);
  7871. // The leaf node must be XOR
  7872. if (N->getOpcode() == ISD::XOR) {
  7873. WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
  7874. Num++;
  7875. return true;
  7876. }
  7877. // All the non-leaf nodes must be OR.
  7878. if (N->getOpcode() != ISD::OR || !N->hasOneUse())
  7879. return false;
  7880. if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
  7881. isOrXorChain(N->getOperand(1), Num, WorkList))
  7882. return true;
  7883. return false;
  7884. }
// Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp
// expansion.
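// For example (illustrative), (seteq (or (xor a, b), (xor c, d)), 0) is the
// canonical form of "a == b && c == d" and can be emitted as a cmp/ccmp chain.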
  7886. static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
  7887. SDValue LHS = N->getOperand(0);
  7888. SDValue RHS = N->getOperand(1);
  7889. SDLoc DL(N);
  7890. EVT VT = N->getValueType(0);
  7891. SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
  7892. // Only handle integer compares.
  7893. if (N->getOpcode() != ISD::SETCC)
  7894. return SDValue();
  7895. ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  7896. // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
  7897. // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
  7898. unsigned NumXors = 0;
  7899. if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
  7900. LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
  7901. isOrXorChain(LHS, NumXors, WorkList)) {
  7902. SDValue XOR0, XOR1;
  7903. std::tie(XOR0, XOR1) = WorkList[0];
  7904. unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
  7905. SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
  7906. for (unsigned I = 1; I < WorkList.size(); I++) {
  7907. std::tie(XOR0, XOR1) = WorkList[I];
  7908. SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
  7909. Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
  7910. }
// Exit early by inverting the condition, which helps reduce indentation.
  7912. return Cmp;
  7913. }
  7914. return SDValue();
  7915. }
  7916. SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  7917. if (Op.getValueType().isVector())
  7918. return LowerVSETCC(Op, DAG);
  7919. bool IsStrict = Op->isStrictFPOpcode();
  7920. bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
  7921. unsigned OpNo = IsStrict ? 1 : 0;
  7922. SDValue Chain;
  7923. if (IsStrict)
  7924. Chain = Op.getOperand(0);
  7925. SDValue LHS = Op.getOperand(OpNo + 0);
  7926. SDValue RHS = Op.getOperand(OpNo + 1);
  7927. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
  7928. SDLoc dl(Op);
  7929. // We chose ZeroOrOneBooleanContents, so use zero and one.
  7930. EVT VT = Op.getValueType();
  7931. SDValue TVal = DAG.getConstant(1, dl, VT);
  7932. SDValue FVal = DAG.getConstant(0, dl, VT);
  7933. // Handle f128 first, since one possible outcome is a normal integer
  7934. // comparison which gets picked up by the next if statement.
  7935. if (LHS.getValueType() == MVT::f128) {
  7936. softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
  7937. IsSignaling);
  7938. // If softenSetCCOperands returned a scalar, use it.
  7939. if (!RHS.getNode()) {
  7940. assert(LHS.getValueType() == Op.getValueType() &&
  7941. "Unexpected setcc expansion!");
  7942. return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
  7943. }
  7944. }
  7945. if (LHS.getValueType().isInteger()) {
  7946. SDValue CCVal;
  7947. SDValue Cmp = getAArch64Cmp(
  7948. LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
  7949. // Note that we inverted the condition above, so we reverse the order of
  7950. // the true and false operands here. This will allow the setcc to be
  7951. // matched to a single CSINC instruction.
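// For example (illustrative), (i32 setcc x, y, setlt) becomes
// "cmp w0, w1 ; cset w0, lt", i.e. "csinc w0, wzr, wzr, ge".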
  7952. SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
  7953. return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
  7954. }
  7955. // Now we know we're dealing with FP values.
  7956. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
  7957. LHS.getValueType() == MVT::f64);
  7958. // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  7959. // and do the comparison.
  7960. SDValue Cmp;
  7961. if (IsStrict)
  7962. Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
  7963. else
  7964. Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  7965. AArch64CC::CondCode CC1, CC2;
  7966. changeFPCCToAArch64CC(CC, CC1, CC2);
  7967. SDValue Res;
  7968. if (CC2 == AArch64CC::AL) {
  7969. changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
  7970. CC2);
  7971. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  7972. // Note that we inverted the condition above, so we reverse the order of
  7973. // the true and false operands here. This will allow the setcc to be
  7974. // matched to a single CSINC instruction.
  7975. Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  7976. } else {
// Unfortunately, the mapping of LLVM FP CCs onto AArch64 CCs isn't totally
// clean. Some of them require two CSELs to implement. In that case, we emit
// the first CSEL and then a second one using the output of the first as its
// false operand; we're effectively OR'ing the two CCs together.
  7981. // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
  7982. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  7983. SDValue CS1 =
  7984. DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
  7985. SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
  7986. Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  7987. }
  7988. return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
  7989. }
  7990. SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
  7991. SelectionDAG &DAG) const {
  7992. SDValue LHS = Op.getOperand(0);
  7993. SDValue RHS = Op.getOperand(1);
  7994. EVT VT = LHS.getValueType();
  7995. if (VT != MVT::i32 && VT != MVT::i64)
  7996. return SDValue();
  7997. SDLoc DL(Op);
  7998. SDValue Carry = Op.getOperand(2);
  7999. // SBCS uses a carry not a borrow so the carry flag should be inverted first.
  8000. SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
  8001. SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
  8002. LHS, RHS, InvCarry);
  8003. EVT OpVT = Op.getValueType();
  8004. SDValue TVal = DAG.getConstant(1, DL, OpVT);
  8005. SDValue FVal = DAG.getConstant(0, DL, OpVT);
  8006. ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  8007. ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
  8008. SDValue CCVal =
  8009. DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
  8010. // Inputs are swapped because the condition is inverted. This will allow
  8011. // matching with a single CSINC instruction.
  8012. return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
  8013. Cmp.getValue(1));
  8014. }
  8015. SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
  8016. SDValue RHS, SDValue TVal,
  8017. SDValue FVal, const SDLoc &dl,
  8018. SelectionDAG &DAG) const {
  8019. // Handle f128 first, because it will result in a comparison of some RTLIB
  8020. // call result against zero.
  8021. if (LHS.getValueType() == MVT::f128) {
  8022. softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
  8023. // If softenSetCCOperands returned a scalar, we need to compare the result
  8024. // against zero to select between true and false values.
  8025. if (!RHS.getNode()) {
  8026. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  8027. CC = ISD::SETNE;
  8028. }
  8029. }
// Also handle f16, for which we need to do an f32 comparison.
  8031. if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
  8032. LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
  8033. RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  8034. }
  8035. // Next, handle integers.
  8036. if (LHS.getValueType().isInteger()) {
  8037. assert((LHS.getValueType() == RHS.getValueType()) &&
  8038. (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
  8039. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  8040. ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
  8041. ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
  8042. // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
// into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
  8044. // supported types.
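// For example (illustrative), for i32 this emits
// "asr w8, w0, #31 ; orr w0, w8, #0x1", yielding 1 for non-negative inputs
// and -1 for negative ones.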
  8045. if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
  8046. CTVal->isOne() && CFVal->isAllOnes() &&
  8047. LHS.getValueType() == TVal.getValueType()) {
  8048. EVT VT = LHS.getValueType();
  8049. SDValue Shift =
  8050. DAG.getNode(ISD::SRA, dl, VT, LHS,
  8051. DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
  8052. return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
  8053. }
  8054. unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
  8057. if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
  8058. std::swap(TVal, FVal);
  8059. std::swap(CTVal, CFVal);
  8060. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  8061. } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
  8062. std::swap(TVal, FVal);
  8063. std::swap(CTVal, CFVal);
  8064. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  8065. } else if (TVal.getOpcode() == ISD::XOR) {
  8066. // If TVal is a NOT we want to swap TVal and FVal so that we can match
  8067. // with a CSINV rather than a CSEL.
  8068. if (isAllOnesConstant(TVal.getOperand(1))) {
  8069. std::swap(TVal, FVal);
  8070. std::swap(CTVal, CFVal);
  8071. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  8072. }
  8073. } else if (TVal.getOpcode() == ISD::SUB) {
  8074. // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
  8075. // that we can match with a CSNEG rather than a CSEL.
  8076. if (isNullConstant(TVal.getOperand(0))) {
  8077. std::swap(TVal, FVal);
  8078. std::swap(CTVal, CFVal);
  8079. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  8080. }
  8081. } else if (CTVal && CFVal) {
  8082. const int64_t TrueVal = CTVal->getSExtValue();
  8083. const int64_t FalseVal = CFVal->getSExtValue();
  8084. bool Swap = false;
  8085. // If both TVal and FVal are constants, see if FVal is the
  8086. // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
  8087. // instead of a CSEL in that case.
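// For example (illustrative): (select cc, 3, -4) can use CSINV since
// 3 == ~(-4), (select cc, 3, -3) can use CSNEG, and (select cc, 4, 3) can use
// CSINC.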
  8088. if (TrueVal == ~FalseVal) {
  8089. Opcode = AArch64ISD::CSINV;
  8090. } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
  8091. TrueVal == -FalseVal) {
  8092. Opcode = AArch64ISD::CSNEG;
  8093. } else if (TVal.getValueType() == MVT::i32) {
  8094. // If our operands are only 32-bit wide, make sure we use 32-bit
  8095. // arithmetic for the check whether we can use CSINC. This ensures that
  8096. // the addition in the check will wrap around properly in case there is
  8097. // an overflow (which would not be the case if we do the check with
  8098. // 64-bit arithmetic).
  8099. const uint32_t TrueVal32 = CTVal->getZExtValue();
  8100. const uint32_t FalseVal32 = CFVal->getZExtValue();
  8101. if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
  8102. Opcode = AArch64ISD::CSINC;
  8103. if (TrueVal32 > FalseVal32) {
  8104. Swap = true;
  8105. }
  8106. }
  8107. } else {
  8108. // 64-bit check whether we can use CSINC.
  8109. const uint64_t TrueVal64 = TrueVal;
  8110. const uint64_t FalseVal64 = FalseVal;
  8111. if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
  8112. Opcode = AArch64ISD::CSINC;
  8113. if (TrueVal > FalseVal) {
  8114. Swap = true;
  8115. }
  8116. }
  8117. }
  8118. // Swap TVal and FVal if necessary.
  8119. if (Swap) {
  8120. std::swap(TVal, FVal);
  8121. std::swap(CTVal, CFVal);
  8122. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  8123. }
  8124. if (Opcode != AArch64ISD::CSEL) {
  8125. // Drop FVal since we can get its value by simply inverting/negating
  8126. // TVal.
  8127. FVal = TVal;
  8128. }
  8129. }
  8130. // Avoid materializing a constant when possible by reusing a known value in
  8131. // a register. However, don't perform this optimization if the known value
  8132. // is one, zero or negative one in the case of a CSEL. We can always
  8133. // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
  8134. // FVal, respectively.
  8135. ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
  8136. if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
  8137. !RHSVal->isZero() && !RHSVal->isAllOnes()) {
  8138. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
  8139. // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
  8140. // "a != C ? x : a" to avoid materializing C.
  8141. if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
  8142. TVal = LHS;
  8143. else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
  8144. FVal = LHS;
  8145. } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
  8146. assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
  8147. // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
  8148. // avoid materializing C.
  8149. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
  8150. if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
  8151. Opcode = AArch64ISD::CSINV;
  8152. TVal = LHS;
  8153. FVal = DAG.getConstant(0, dl, FVal.getValueType());
  8154. }
  8155. }
  8156. SDValue CCVal;
  8157. SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
  8158. EVT VT = TVal.getValueType();
  8159. return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  8160. }
  8161. // Now we know we're dealing with FP values.
  8162. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
  8163. LHS.getValueType() == MVT::f64);
  8164. assert(LHS.getValueType() == RHS.getValueType());
  8165. EVT VT = TVal.getValueType();
  8166. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  8167. // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  8168. // clean. Some of them require two CSELs to implement.
  8169. AArch64CC::CondCode CC1, CC2;
  8170. changeFPCCToAArch64CC(CC, CC1, CC2);
  8171. if (DAG.getTarget().Options.UnsafeFPMath) {
  8172. // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
  8173. // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
  8174. ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
  8175. if (RHSVal && RHSVal->isZero()) {
  8176. ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
  8177. ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
  8178. if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
  8179. CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
  8180. TVal = LHS;
  8181. else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
  8182. CFVal && CFVal->isZero() &&
  8183. FVal.getValueType() == LHS.getValueType())
  8184. FVal = LHS;
  8185. }
  8186. }
  8187. // Emit first, and possibly only, CSEL.
  8188. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  8189. SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
  8190. // If we need a second CSEL, emit it, using the output of the first as the
  8191. // RHS. We're effectively OR'ing the two CC's together.
  8192. if (CC2 != AArch64CC::AL) {
  8193. SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
  8194. return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  8195. }
  8196. // Otherwise, return the output of the first CSEL.
  8197. return CS1;
  8198. }
  8199. SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
  8200. SelectionDAG &DAG) const {
  8201. EVT Ty = Op.getValueType();
  8202. auto Idx = Op.getConstantOperandAPInt(2);
  8203. int64_t IdxVal = Idx.getSExtValue();
  8204. assert(Ty.isScalableVector() &&
  8205. "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
  8206. // We can use the splice instruction for certain index values where we are
  8207. // able to efficiently generate the correct predicate. The index will be
  8208. // inverted and used directly as the input to the ptrue instruction, i.e.
  8209. // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
  8210. // splice predicate. However, we can only do this if we can guarantee that
  8211. // there are enough elements in the vector, hence we check the index <= min
  8212. // number of elements.
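// For example (illustrative), a splice of two nxv4i32 vectors with index -2
// becomes:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s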
  8213. std::optional<unsigned> PredPattern;
  8214. if (Ty.isScalableVector() && IdxVal < 0 &&
  8215. (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
  8216. std::nullopt) {
  8217. SDLoc DL(Op);
  8218. // Create a predicate where all but the last -IdxVal elements are false.
  8219. EVT PredVT = Ty.changeVectorElementType(MVT::i1);
  8220. SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
  8221. Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
  8222. // Now splice the two inputs together using the predicate.
  8223. return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
  8224. Op.getOperand(1));
  8225. }
  8226. // This will select to an EXT instruction, which has a maximum immediate
  8227. // value of 255, hence 2048-bits is the maximum value we can lower.
  8228. if (IdxVal >= 0 &&
  8229. IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
  8230. return Op;
  8231. return SDValue();
  8232. }
  8233. SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
  8234. SelectionDAG &DAG) const {
  8235. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  8236. SDValue LHS = Op.getOperand(0);
  8237. SDValue RHS = Op.getOperand(1);
  8238. SDValue TVal = Op.getOperand(2);
  8239. SDValue FVal = Op.getOperand(3);
  8240. SDLoc DL(Op);
  8241. return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
  8242. }
  8243. SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
  8244. SelectionDAG &DAG) const {
  8245. SDValue CCVal = Op->getOperand(0);
  8246. SDValue TVal = Op->getOperand(1);
  8247. SDValue FVal = Op->getOperand(2);
  8248. SDLoc DL(Op);
  8249. EVT Ty = Op.getValueType();
  8250. if (Ty.isScalableVector()) {
  8251. SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
  8252. MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
  8253. SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
  8254. return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  8255. }
  8256. if (useSVEForFixedLengthVectorVT(Ty)) {
  8257. // FIXME: Ideally this would be the same as above using i1 types, however
  8258. // for the moment we can't deal with fixed i1 vector types properly, so
  8259. // instead extend the predicate to a result type sized integer vector.
  8260. MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
  8261. MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
  8262. SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
  8263. SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
  8264. return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  8265. }
  8266. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  8267. // instruction.
  8268. if (ISD::isOverflowIntrOpRes(CCVal)) {
  8269. // Only lower legal XALUO ops.
  8270. if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
  8271. return SDValue();
  8272. AArch64CC::CondCode OFCC;
  8273. SDValue Value, Overflow;
  8274. std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
  8275. SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
  8276. return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
  8277. CCVal, Overflow);
  8278. }
  8279. // Lower it the same way as we would lower a SELECT_CC node.
  8280. ISD::CondCode CC;
  8281. SDValue LHS, RHS;
  8282. if (CCVal.getOpcode() == ISD::SETCC) {
  8283. LHS = CCVal.getOperand(0);
  8284. RHS = CCVal.getOperand(1);
  8285. CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
  8286. } else {
  8287. LHS = CCVal;
  8288. RHS = DAG.getConstant(0, DL, CCVal.getValueType());
  8289. CC = ISD::SETNE;
  8290. }
// If we are lowering an f16 and we do not have FullFP16, convert to an f32 in
// order to use FCSELSrrr.
  8293. if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
  8294. TVal = SDValue(
  8295. DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
  8296. DAG.getUNDEF(MVT::f32), TVal,
  8297. DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
  8298. 0);
  8299. FVal = SDValue(
  8300. DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
  8301. DAG.getUNDEF(MVT::f32), FVal,
  8302. DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
  8303. 0);
  8304. }
  8305. SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
  8306. if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
  8307. Res = SDValue(
  8308. DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res,
  8309. DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
  8310. 0);
  8311. }
  8312. return Res;
  8313. }
  8314. SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
  8315. SelectionDAG &DAG) const {
// Jump table entries are emitted as PC-relative offsets. No additional
// tweaking is necessary here; just get the address of the jump table.
  8318. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  8319. if (getTargetMachine().getCodeModel() == CodeModel::Large &&
  8320. !Subtarget->isTargetMachO()) {
  8321. return getAddrLarge(JT, DAG);
  8322. } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
  8323. return getAddrTiny(JT, DAG);
  8324. }
  8325. return getAddr(JT, DAG);
  8326. }
  8327. SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
  8328. SelectionDAG &DAG) const {
// Jump table entries are emitted as PC-relative offsets. No additional
// tweaking is necessary here; just get the address of the jump table.
  8331. SDLoc DL(Op);
  8332. SDValue JT = Op.getOperand(1);
  8333. SDValue Entry = Op.getOperand(2);
  8334. int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
  8335. auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  8336. AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
  8337. SDNode *Dest =
  8338. DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
  8339. Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
  8340. return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
  8341. SDValue(Dest, 0));
  8342. }
  8343. SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
  8344. SelectionDAG &DAG) const {
  8345. ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  8346. if (getTargetMachine().getCodeModel() == CodeModel::Large) {
  8347. // Use the GOT for the large code model on iOS.
  8348. if (Subtarget->isTargetMachO()) {
  8349. return getGOT(CP, DAG);
  8350. }
  8351. return getAddrLarge(CP, DAG);
  8352. } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
  8353. return getAddrTiny(CP, DAG);
  8354. } else {
  8355. return getAddr(CP, DAG);
  8356. }
  8357. }
  8358. SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
  8359. SelectionDAG &DAG) const {
  8360. BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
  8361. if (getTargetMachine().getCodeModel() == CodeModel::Large &&
  8362. !Subtarget->isTargetMachO()) {
  8363. return getAddrLarge(BA, DAG);
  8364. } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
  8365. return getAddrTiny(BA, DAG);
  8366. }
  8367. return getAddr(BA, DAG);
  8368. }
  8369. SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
  8370. SelectionDAG &DAG) const {
  8371. AArch64FunctionInfo *FuncInfo =
  8372. DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  8373. SDLoc DL(Op);
  8374. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
  8375. getPointerTy(DAG.getDataLayout()));
  8376. FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
  8377. const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  8378. return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
  8379. MachinePointerInfo(SV));
  8380. }
  8381. SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
  8382. SelectionDAG &DAG) const {
  8383. MachineFunction &MF = DAG.getMachineFunction();
  8384. AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  8385. SDLoc DL(Op);
  8386. SDValue FR;
  8387. if (Subtarget->isWindowsArm64EC()) {
  8388. // With the Arm64EC ABI, we compute the address of the varargs save area
  8389. // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
  8390. // but calls from an entry thunk can pass in a different address.
  8391. Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
  8392. SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
  8393. uint64_t StackOffset;
  8394. if (FuncInfo->getVarArgsGPRSize() > 0)
  8395. StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
  8396. else
  8397. StackOffset = FuncInfo->getVarArgsStackOffset();
  8398. FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
  8399. DAG.getConstant(StackOffset, DL, MVT::i64));
  8400. } else {
  8401. FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
  8402. ? FuncInfo->getVarArgsGPRIndex()
  8403. : FuncInfo->getVarArgsStackIndex(),
  8404. getPointerTy(DAG.getDataLayout()));
  8405. }
  8406. const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  8407. return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
  8408. MachinePointerInfo(SV));
  8409. }
  8410. SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
  8411. SelectionDAG &DAG) const {
  8412. // The layout of the va_list struct is specified in the AArch64 Procedure Call
  8413. // Standard, section B.3.
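// For reference, the layout written below is (offsets for LP64, ILP32 in
// parentheses):
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4)
//     void *__vr_top;  // offset 16 (8)
//     int   __gr_offs; // offset 24 (12)
//     int   __vr_offs; // offset 28 (16)
//   };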
  8414. MachineFunction &MF = DAG.getMachineFunction();
  8415. AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  8416. unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  8417. auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  8418. auto PtrVT = getPointerTy(DAG.getDataLayout());
  8419. SDLoc DL(Op);
  8420. SDValue Chain = Op.getOperand(0);
  8421. SDValue VAList = Op.getOperand(1);
  8422. const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  8423. SmallVector<SDValue, 4> MemOps;
  8424. // void *__stack at offset 0
  8425. unsigned Offset = 0;
  8426. SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
  8427. Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
  8428. MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
  8429. MachinePointerInfo(SV), Align(PtrSize)));
  8430. // void *__gr_top at offset 8 (4 on ILP32)
  8431. Offset += PtrSize;
  8432. int GPRSize = FuncInfo->getVarArgsGPRSize();
  8433. if (GPRSize > 0) {
  8434. SDValue GRTop, GRTopAddr;
  8435. GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8436. DAG.getConstant(Offset, DL, PtrVT));
  8437. GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
  8438. GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
  8439. DAG.getConstant(GPRSize, DL, PtrVT));
  8440. GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
  8441. MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
  8442. MachinePointerInfo(SV, Offset),
  8443. Align(PtrSize)));
  8444. }
  8445. // void *__vr_top at offset 16 (8 on ILP32)
  8446. Offset += PtrSize;
  8447. int FPRSize = FuncInfo->getVarArgsFPRSize();
  8448. if (FPRSize > 0) {
  8449. SDValue VRTop, VRTopAddr;
  8450. VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8451. DAG.getConstant(Offset, DL, PtrVT));
  8452. VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
  8453. VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
  8454. DAG.getConstant(FPRSize, DL, PtrVT));
  8455. VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
  8456. MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
  8457. MachinePointerInfo(SV, Offset),
  8458. Align(PtrSize)));
  8459. }
  8460. // int __gr_offs at offset 24 (12 on ILP32)
  8461. Offset += PtrSize;
  8462. SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8463. DAG.getConstant(Offset, DL, PtrVT));
  8464. MemOps.push_back(
  8465. DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
  8466. GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
  8467. // int __vr_offs at offset 28 (16 on ILP32)
  8468. Offset += 4;
  8469. SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8470. DAG.getConstant(Offset, DL, PtrVT));
  8471. MemOps.push_back(
  8472. DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
  8473. VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
  8474. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  8475. }
  8476. SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
  8477. SelectionDAG &DAG) const {
  8478. MachineFunction &MF = DAG.getMachineFunction();
  8479. if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
  8480. return LowerWin64_VASTART(Op, DAG);
  8481. else if (Subtarget->isTargetDarwin())
  8482. return LowerDarwin_VASTART(Op, DAG);
  8483. else
  8484. return LowerAAPCS_VASTART(Op, DAG);
  8485. }
  8486. SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
  8487. SelectionDAG &DAG) const {
  8488. // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
  8489. // pointer.
  8490. SDLoc DL(Op);
  8491. unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  8492. unsigned VaListSize =
  8493. (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
  8494. ? PtrSize
  8495. : Subtarget->isTargetILP32() ? 20 : 32;
  8496. const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  8497. const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  8498. return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
  8499. DAG.getConstant(VaListSize, DL, MVT::i32),
  8500. Align(PtrSize), false, false, false,
  8501. MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
  8502. }
  8503. SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  8504. assert(Subtarget->isTargetDarwin() &&
  8505. "automatic va_arg instruction only works on Darwin");
  8506. const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  8507. EVT VT = Op.getValueType();
  8508. SDLoc DL(Op);
  8509. SDValue Chain = Op.getOperand(0);
  8510. SDValue Addr = Op.getOperand(1);
  8511. MaybeAlign Align(Op.getConstantOperandVal(3));
  8512. unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
  8513. auto PtrVT = getPointerTy(DAG.getDataLayout());
  8514. auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  8515. SDValue VAList =
  8516. DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
  8517. Chain = VAList.getValue(1);
  8518. VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
  8519. if (VT.isScalableVector())
  8520. report_fatal_error("Passing SVE types to variadic functions is "
  8521. "currently not supported");
  8522. if (Align && *Align > MinSlotSize) {
  8523. VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8524. DAG.getConstant(Align->value() - 1, DL, PtrVT));
  8525. VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
  8526. DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
  8527. }
  8528. Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
  8529. unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  8530. // Scalar integer and FP values smaller than 64 bits are implicitly extended
  8531. // up to 64 bits. At the very least, we have to increase the striding of the
  8532. // vaargs list to match this, and for FP values we need to introduce
  8533. // FP_ROUND nodes as well.
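// For example, an f32 va_arg still advances the list by 8 bytes: the value
// is loaded as an f64 below and FP_ROUNDed back to f32.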
  8534. if (VT.isInteger() && !VT.isVector())
  8535. ArgSize = std::max(ArgSize, MinSlotSize);
  8536. bool NeedFPTrunc = false;
  8537. if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
  8538. ArgSize = 8;
  8539. NeedFPTrunc = true;
  8540. }
  8541. // Increment the pointer, VAList, to the next vaarg
  8542. SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
  8543. DAG.getConstant(ArgSize, DL, PtrVT));
  8544. VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
  8545. // Store the incremented VAList to the legalized pointer
  8546. SDValue APStore =
  8547. DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
  8548. // Load the actual argument out of the pointer VAList
  8549. if (NeedFPTrunc) {
  8550. // Load the value as an f64.
  8551. SDValue WideFP =
  8552. DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
  8553. // Round the value down to an f32.
  8554. SDValue NarrowFP =
  8555. DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
  8556. DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
  8557. SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
  8558. // Merge the rounded value with the chain output of the load.
  8559. return DAG.getMergeValues(Ops, DL);
  8560. }
  8561. return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
  8562. }
  8563. SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
  8564. SelectionDAG &DAG) const {
  8565. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  8566. MFI.setFrameAddressIsTaken(true);
  8567. EVT VT = Op.getValueType();
  8568. SDLoc DL(Op);
  8569. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  8570. SDValue FrameAddr =
  8571. DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
  8572. while (Depth--)
  8573. FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
  8574. MachinePointerInfo());
  8575. if (Subtarget->isTargetILP32())
  8576. FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
  8577. DAG.getValueType(VT));
  8578. return FrameAddr;
  8579. }
  8580. SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
  8581. SelectionDAG &DAG) const {
  8582. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  8583. EVT VT = getPointerTy(DAG.getDataLayout());
  8584. SDLoc DL(Op);
  8585. int FI = MFI.CreateFixedObject(4, 0, false);
  8586. return DAG.getFrameIndex(FI, VT);
  8587. }
  8588. #define GET_REGISTER_MATCHER
  8589. #include "AArch64GenAsmMatcher.inc"
  8590. // FIXME? Maybe this could be a TableGen attribute on some registers and
  8591. // this table could be generated automatically from RegInfo.
  8592. Register AArch64TargetLowering::
  8593. getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
  8594. Register Reg = MatchRegisterName(RegName);
  8595. if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
  8596. const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
  8597. unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
  8598. if (!Subtarget->isXRegisterReserved(DwarfRegNum))
  8599. Reg = 0;
  8600. }
  8601. if (Reg)
  8602. return Reg;
  8603. report_fatal_error(Twine("Invalid register name \""
  8604. + StringRef(RegName) + "\"."));
  8605. }
  8606. SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
  8607. SelectionDAG &DAG) const {
  8608. DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
  8609. EVT VT = Op.getValueType();
  8610. SDLoc DL(Op);
  8611. SDValue FrameAddr =
  8612. DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  8613. SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
  8614. return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
  8615. }
  8616. SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
  8617. SelectionDAG &DAG) const {
  8618. MachineFunction &MF = DAG.getMachineFunction();
  8619. MachineFrameInfo &MFI = MF.getFrameInfo();
  8620. MFI.setReturnAddressIsTaken(true);
  8621. EVT VT = Op.getValueType();
  8622. SDLoc DL(Op);
  8623. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  8624. SDValue ReturnAddress;
  8625. if (Depth) {
  8626. SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
  8627. SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
  8628. ReturnAddress = DAG.getLoad(
  8629. VT, DL, DAG.getEntryNode(),
  8630. DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
  8631. } else {
  8632. // Return LR, which contains the return address. Mark it an implicit
  8633. // live-in.
  8634. Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
  8635. ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  8636. }
// The XPACLRI instruction assembles to a hint-space instruction before
// Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
// On Armv8.3-A and onwards, XPACI is available, so use that instead.
  8641. SDNode *St;
  8642. if (Subtarget->hasPAuth()) {
  8643. St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  8644. } else {
  8645. // XPACLRI operates on LR therefore we must move the operand accordingly.
  8646. SDValue Chain =
  8647. DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
  8648. St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  8649. }
  8650. return SDValue(St, 0);
  8651. }
/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
  8654. SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
  8655. SelectionDAG &DAG) const {
  8656. SDValue Lo, Hi;
  8657. expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  8658. return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
  8659. }
  8660. bool AArch64TargetLowering::isOffsetFoldingLegal(
  8661. const GlobalAddressSDNode *GA) const {
  8662. // Offsets are folded in the DAG combine rather than here so that we can
  8663. // intelligently choose an offset based on the uses.
  8664. return false;
  8665. }
  8666. bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
  8667. bool OptForSize) const {
  8668. bool IsLegal = false;
// We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
// and for the 16-bit case when the target has full fp16 support.
  8671. // FIXME: We should be able to handle f128 as well with a clever lowering.
  8672. const APInt ImmInt = Imm.bitcastToAPInt();
  8673. if (VT == MVT::f64)
  8674. IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  8675. else if (VT == MVT::f32)
  8676. IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  8677. else if (VT == MVT::f16 && Subtarget->hasFullFP16())
  8678. IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
// TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
// generate that fmov.
// If we cannot materialize the value in the immediate field of an fmov, check
// whether it can be encoded as the immediate operand of a logical instruction.
// The immediate value will be created with either MOVZ, MOVN, or ORR.
  8684. if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
  8685. // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
  8686. // however the mov+fmov sequence is always better because of the reduced
  8687. // cache pressure. The timings are still the same if you consider
  8688. // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
// movw+movk is fused). So we limit the expansion to at most 2 instructions.
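// For instance, +1.0 fits the 8-bit FMOV immediate directly, while 0.1f has
// no FMOV encoding but should expand to MOVZ+MOVK plus the FMOV, which is
// within the 2-instruction limit below when not optimizing for size.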
  8690. SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  8691. AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
  8692. Insn);
  8693. unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
  8694. IsLegal = Insn.size() <= Limit;
  8695. }
  8696. LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
  8697. << " imm value: "; Imm.dump(););
  8698. return IsLegal;
  8699. }
  8700. //===----------------------------------------------------------------------===//
  8701. // AArch64 Optimization Hooks
  8702. //===----------------------------------------------------------------------===//
  8703. static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
  8704. SDValue Operand, SelectionDAG &DAG,
  8705. int &ExtraSteps) {
  8706. EVT VT = Operand.getValueType();
  8707. if ((ST->hasNEON() &&
  8708. (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
  8709. VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
  8710. VT == MVT::v4f32)) ||
  8711. (ST->hasSVE() &&
  8712. (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
  8713. if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
  8714. // For the reciprocal estimates, convergence is quadratic, so the number
  8715. // of digits is doubled after each iteration. In ARMv8, the accuracy of
  8716. // the initial estimate is 2^-8. Thus the number of extra steps to refine
  8717. // the result for float (23 mantissa bits) is 2 and for double (52
  8718. // mantissa bits) is 3.
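// Concretely: the accuracy goes from 2^-8 to 2^-16 after one step, 2^-32
// after two (covering float's 23 mantissa bits), and 2^-64 after three
// (covering double's 52 mantissa bits).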
  8719. ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
  8720. return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
  8721. }
  8722. return SDValue();
  8723. }
  8724. SDValue
  8725. AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
  8726. const DenormalMode &Mode) const {
  8727. SDLoc DL(Op);
  8728. EVT VT = Op.getValueType();
  8729. EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  8730. SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
  8731. return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
  8732. }
  8733. SDValue
  8734. AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
  8735. SelectionDAG &DAG) const {
  8736. return Op;
  8737. }
  8738. SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
  8739. SelectionDAG &DAG, int Enabled,
  8740. int &ExtraSteps,
  8741. bool &UseOneConst,
  8742. bool Reciprocal) const {
  8743. if (Enabled == ReciprocalEstimate::Enabled ||
  8744. (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
  8745. if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
  8746. DAG, ExtraSteps)) {
  8747. SDLoc DL(Operand);
  8748. EVT VT = Operand.getValueType();
  8749. SDNodeFlags Flags;
  8750. Flags.setAllowReassociation(true);
  8751. // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
  8752. // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
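// Each iteration below therefore computes Step = 0.5 * (3 - X * E * E) via
// FRSQRTS and refines the estimate as E = E * Step.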
  8753. for (int i = ExtraSteps; i > 0; --i) {
  8754. SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
  8755. Flags);
  8756. Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
  8757. Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
  8758. }
  8759. if (!Reciprocal)
  8760. Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
  8761. ExtraSteps = 0;
  8762. return Estimate;
  8763. }
  8764. return SDValue();
  8765. }
  8766. SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
  8767. SelectionDAG &DAG, int Enabled,
  8768. int &ExtraSteps) const {
  8769. if (Enabled == ReciprocalEstimate::Enabled)
  8770. if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
  8771. DAG, ExtraSteps)) {
  8772. SDLoc DL(Operand);
  8773. EVT VT = Operand.getValueType();
  8774. SDNodeFlags Flags;
  8775. Flags.setAllowReassociation(true);
  8776. // Newton reciprocal iteration: E * (2 - X * E)
  8777. // AArch64 reciprocal iteration instruction: (2 - M * N)
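// Each iteration below computes Step = 2 - X * E via FRECPS and refines the
// estimate as E = E * Step.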
  8778. for (int i = ExtraSteps; i > 0; --i) {
  8779. SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
  8780. Estimate, Flags);
  8781. Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
  8782. }
  8783. ExtraSteps = 0;
  8784. return Estimate;
  8785. }
  8786. return SDValue();
  8787. }
  8788. //===----------------------------------------------------------------------===//
  8789. // AArch64 Inline Assembly Support
  8790. //===----------------------------------------------------------------------===//
  8791. // Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense.
  8794. //
  8795. // r - A general register
  8796. // w - An FP/SIMD register of some size in the range v0-v31
  8797. // x - An FP/SIMD register of some size in the range v0-v15
  8798. // I - Constant that can be used with an ADD instruction
  8799. // J - Constant that can be used with a SUB instruction
  8800. // K - Constant that can be used with a 32-bit logical instruction
  8801. // L - Constant that can be used with a 64-bit logical instruction
  8802. // M - Constant that can be used as a 32-bit MOV immediate
  8803. // N - Constant that can be used as a 64-bit MOV immediate
  8804. // Q - A memory reference with base register and no offset
  8805. // S - A symbolic address
  8806. // Y - Floating point constant zero
  8807. // Z - Integer constant zero
  8808. //
  8809. // Note that general register operands will be output using their 64-bit x
  8810. // register name, whatever the size of the variable, unless the asm operand
  8811. // is prefixed by the %w modifier. Floating-point and SIMD register operands
  8812. // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
  8813. // %q modifier.
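// For illustration, typical uses from C (variable names are arbitrary):
//   asm("add %w0, %w1, #42"  : "=r"(res) : "r"(val)); // GPR, printed as a w-reg
//   asm("fadd %d0, %d1, %d1" : "=w"(y)   : "w"(x));   // FP/SIMD, printed as a d-reg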
  8814. const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  8815. // At this point, we have to lower this constraint to something else, so we
  8816. // lower it to an "r" or "w". However, by doing this we will force the result
  8817. // to be in register, while the X constraint is much more permissive.
  8818. //
  8819. // Although we are correct (we are free to emit anything, without
  8820. // constraints), we might break use cases that would expect us to be more
  8821. // efficient and emit something else.
  8822. if (!Subtarget->hasFPARMv8())
  8823. return "r";
  8824. if (ConstraintVT.isFloatingPoint())
  8825. return "w";
  8826. if (ConstraintVT.isVector() &&
  8827. (ConstraintVT.getSizeInBits() == 64 ||
  8828. ConstraintVT.getSizeInBits() == 128))
  8829. return "w";
  8830. return "r";
  8831. }
  8832. enum PredicateConstraint {
  8833. Upl,
  8834. Upa,
  8835. Invalid
  8836. };
  8837. static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
  8838. PredicateConstraint P = PredicateConstraint::Invalid;
  8839. if (Constraint == "Upa")
  8840. P = PredicateConstraint::Upa;
  8841. if (Constraint == "Upl")
  8842. P = PredicateConstraint::Upl;
  8843. return P;
  8844. }
  8845. /// getConstraintType - Given a constraint letter, return the type of
  8846. /// constraint it is for this target.
  8847. AArch64TargetLowering::ConstraintType
  8848. AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
  8849. if (Constraint.size() == 1) {
  8850. switch (Constraint[0]) {
  8851. default:
  8852. break;
  8853. case 'x':
  8854. case 'w':
  8855. case 'y':
  8856. return C_RegisterClass;
  8857. // An address with a single base register. Due to the way we
  8858. // currently handle addresses it is the same as 'r'.
  8859. case 'Q':
  8860. return C_Memory;
  8861. case 'I':
  8862. case 'J':
  8863. case 'K':
  8864. case 'L':
  8865. case 'M':
  8866. case 'N':
  8867. case 'Y':
  8868. case 'Z':
  8869. return C_Immediate;
  8870. case 'z':
  8871. case 'S': // A symbolic address
  8872. return C_Other;
  8873. }
  8874. } else if (parsePredicateConstraint(Constraint) !=
  8875. PredicateConstraint::Invalid)
  8876. return C_RegisterClass;
  8877. return TargetLowering::getConstraintType(Constraint);
  8878. }
  8879. /// Examine constraint type and operand type and determine a weight value.
  8880. /// This object must already have been set up with the operand type
  8881. /// and the current alternative constraint selected.
  8882. TargetLowering::ConstraintWeight
  8883. AArch64TargetLowering::getSingleConstraintMatchWeight(
  8884. AsmOperandInfo &info, const char *constraint) const {
  8885. ConstraintWeight weight = CW_Invalid;
  8886. Value *CallOperandVal = info.CallOperandVal;
  8887. // If we don't have a value, we can't do a match,
  8888. // but allow it at the lowest weight.
  8889. if (!CallOperandVal)
  8890. return CW_Default;
  8891. Type *type = CallOperandVal->getType();
  8892. // Look at the constraint type.
  8893. switch (*constraint) {
  8894. default:
  8895. weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  8896. break;
  8897. case 'x':
  8898. case 'w':
  8899. case 'y':
  8900. if (type->isFloatingPointTy() || type->isVectorTy())
  8901. weight = CW_Register;
  8902. break;
  8903. case 'z':
  8904. weight = CW_Constant;
  8905. break;
  8906. case 'U':
  8907. if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
  8908. weight = CW_Register;
  8909. break;
  8910. }
  8911. return weight;
  8912. }
  8913. std::pair<unsigned, const TargetRegisterClass *>
  8914. AArch64TargetLowering::getRegForInlineAsmConstraint(
  8915. const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  8916. if (Constraint.size() == 1) {
  8917. switch (Constraint[0]) {
  8918. case 'r':
  8919. if (VT.isScalableVector())
  8920. return std::make_pair(0U, nullptr);
  8921. if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
  8922. return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
  8923. if (VT.getFixedSizeInBits() == 64)
  8924. return std::make_pair(0U, &AArch64::GPR64commonRegClass);
  8925. return std::make_pair(0U, &AArch64::GPR32commonRegClass);
  8926. case 'w': {
  8927. if (!Subtarget->hasFPARMv8())
  8928. break;
  8929. if (VT.isScalableVector()) {
  8930. if (VT.getVectorElementType() != MVT::i1)
  8931. return std::make_pair(0U, &AArch64::ZPRRegClass);
  8932. return std::make_pair(0U, nullptr);
  8933. }
  8934. uint64_t VTSize = VT.getFixedSizeInBits();
  8935. if (VTSize == 16)
  8936. return std::make_pair(0U, &AArch64::FPR16RegClass);
  8937. if (VTSize == 32)
  8938. return std::make_pair(0U, &AArch64::FPR32RegClass);
  8939. if (VTSize == 64)
  8940. return std::make_pair(0U, &AArch64::FPR64RegClass);
  8941. if (VTSize == 128)
  8942. return std::make_pair(0U, &AArch64::FPR128RegClass);
  8943. break;
  8944. }
  8945. // The instructions that this constraint is designed for can
  8946. // only take 128-bit registers so just use that regclass.
  8947. case 'x':
  8948. if (!Subtarget->hasFPARMv8())
  8949. break;
  8950. if (VT.isScalableVector())
  8951. return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
  8952. if (VT.getSizeInBits() == 128)
  8953. return std::make_pair(0U, &AArch64::FPR128_loRegClass);
  8954. break;
  8955. case 'y':
  8956. if (!Subtarget->hasFPARMv8())
  8957. break;
  8958. if (VT.isScalableVector())
  8959. return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
  8960. break;
  8961. }
  8962. } else {
  8963. PredicateConstraint PC = parsePredicateConstraint(Constraint);
  8964. if (PC != PredicateConstraint::Invalid) {
  8965. if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
  8966. return std::make_pair(0U, nullptr);
  8967. bool restricted = (PC == PredicateConstraint::Upl);
  8968. return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
  8969. : std::make_pair(0U, &AArch64::PPRRegClass);
  8970. }
  8971. }
  8972. if (StringRef("{cc}").equals_insensitive(Constraint))
  8973. return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
  8974. // Use the default implementation in TargetLowering to convert the register
  8975. // constraint into a member of a register class.
  8976. std::pair<unsigned, const TargetRegisterClass *> Res;
  8977. Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  8978. // Not found as a standard register?
  8979. if (!Res.second) {
  8980. unsigned Size = Constraint.size();
  8981. if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
  8982. tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
  8983. int RegNo;
  8984. bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
  8985. if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this; if a modifier is given, the
// correctly sized register will be emitted instead.
  8989. if (VT != MVT::Other && VT.getSizeInBits() == 64) {
  8990. Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
  8991. Res.second = &AArch64::FPR64RegClass;
  8992. } else {
  8993. Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
  8994. Res.second = &AArch64::FPR128RegClass;
  8995. }
  8996. }
  8997. }
  8998. }
  8999. if (Res.second && !Subtarget->hasFPARMv8() &&
  9000. !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
  9001. !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
  9002. return std::make_pair(0U, nullptr);
  9003. return Res;
  9004. }
  9005. EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
  9006. llvm::Type *Ty,
  9007. bool AllowUnknown) const {
  9008. if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
  9009. return EVT(MVT::i64x8);
  9010. return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
  9011. }
  9012. /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
  9013. /// vector. If it is invalid, don't add anything to Ops.
  9014. void AArch64TargetLowering::LowerAsmOperandForConstraint(
  9015. SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
  9016. SelectionDAG &DAG) const {
  9017. SDValue Result;
  9018. // Currently only support length 1 constraints.
  9019. if (Constraint.length() != 1)
  9020. return;
  9021. char ConstraintLetter = Constraint[0];
  9022. switch (ConstraintLetter) {
  9023. default:
  9024. break;
// This set of constraints deals with valid constants for various instructions.
  9026. // Validate and return a target constant for them if we can.
  9027. case 'z': {
  9028. // 'z' maps to xzr or wzr so it needs an input of 0.
  9029. if (!isNullConstant(Op))
  9030. return;
  9031. if (Op.getValueType() == MVT::i64)
  9032. Result = DAG.getRegister(AArch64::XZR, MVT::i64);
  9033. else
  9034. Result = DAG.getRegister(AArch64::WZR, MVT::i32);
  9035. break;
  9036. }
  9037. case 'S': {
  9038. // An absolute symbolic address or label reference.
  9039. if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
  9040. Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
  9041. GA->getValueType(0));
  9042. } else if (const BlockAddressSDNode *BA =
  9043. dyn_cast<BlockAddressSDNode>(Op)) {
  9044. Result =
  9045. DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
  9046. } else
  9047. return;
  9048. break;
  9049. }
  9050. case 'I':
  9051. case 'J':
  9052. case 'K':
  9053. case 'L':
  9054. case 'M':
  9055. case 'N':
  9056. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  9057. if (!C)
  9058. return;
  9059. // Grab the value and do some validation.
  9060. uint64_t CVal = C->getZExtValue();
  9061. switch (ConstraintLetter) {
  9062. // The I constraint applies only to simple ADD or SUB immediate operands:
  9063. // i.e. 0 to 4095 with optional shift by 12
  9064. // The J constraint applies only to ADD or SUB immediates that would be
  9065. // valid when negated, i.e. if [an add pattern] were to be output as a SUB
  9066. // instruction [or vice versa], in other words -1 to -4095 with optional
  9067. // left shift by 12.
  9068. case 'I':
  9069. if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
  9070. break;
  9071. return;
  9072. case 'J': {
  9073. uint64_t NVal = -C->getSExtValue();
  9074. if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
  9075. CVal = C->getSExtValue();
  9076. break;
  9077. }
  9078. return;
  9079. }
  9080. // The K and L constraints apply *only* to logical immediates, including
  9081. // what used to be the MOVI alias for ORR (though the MOVI alias has now
  9082. // been removed and MOV should be used). So these constraints have to
  9083. // distinguish between bit patterns that are valid 32-bit or 64-bit
  9084. // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
  9085. // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
  9086. // versa.
  9087. case 'K':
  9088. if (AArch64_AM::isLogicalImmediate(CVal, 32))
  9089. break;
  9090. return;
  9091. case 'L':
  9092. if (AArch64_AM::isLogicalImmediate(CVal, 64))
  9093. break;
  9094. return;
  9095. // The M and N constraints are a superset of K and L respectively, for use
  9096. // with the MOV (immediate) alias. As well as the logical immediates they
  9097. // also match 32 or 64-bit immediates that can be loaded either using a
// *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
// (M) or 64-bit 0x1234000000000000 (N) etc.
// As a note, some of this code is liberally stolen from the asm parser.
  9101. case 'M': {
  9102. if (!isUInt<32>(CVal))
  9103. return;
  9104. if (AArch64_AM::isLogicalImmediate(CVal, 32))
  9105. break;
  9106. if ((CVal & 0xFFFF) == CVal)
  9107. break;
  9108. if ((CVal & 0xFFFF0000ULL) == CVal)
  9109. break;
  9110. uint64_t NCVal = ~(uint32_t)CVal;
  9111. if ((NCVal & 0xFFFFULL) == NCVal)
  9112. break;
  9113. if ((NCVal & 0xFFFF0000ULL) == NCVal)
  9114. break;
  9115. return;
  9116. }
  9117. case 'N': {
  9118. if (AArch64_AM::isLogicalImmediate(CVal, 64))
  9119. break;
  9120. if ((CVal & 0xFFFFULL) == CVal)
  9121. break;
  9122. if ((CVal & 0xFFFF0000ULL) == CVal)
  9123. break;
  9124. if ((CVal & 0xFFFF00000000ULL) == CVal)
  9125. break;
  9126. if ((CVal & 0xFFFF000000000000ULL) == CVal)
  9127. break;
  9128. uint64_t NCVal = ~CVal;
  9129. if ((NCVal & 0xFFFFULL) == NCVal)
  9130. break;
  9131. if ((NCVal & 0xFFFF0000ULL) == NCVal)
  9132. break;
  9133. if ((NCVal & 0xFFFF00000000ULL) == NCVal)
  9134. break;
  9135. if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
  9136. break;
  9137. return;
  9138. }
  9139. default:
  9140. return;
  9141. }
  9142. // All assembler immediates are 64-bit integers.
  9143. Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
  9144. break;
  9145. }
  9146. if (Result.getNode()) {
  9147. Ops.push_back(Result);
  9148. return;
  9149. }
  9150. return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  9151. }
  9152. //===----------------------------------------------------------------------===//
  9153. // AArch64 Advanced SIMD Support
  9154. //===----------------------------------------------------------------------===//
  9155. /// WidenVector - Given a value in the V64 register class, produce the
  9156. /// equivalent value in the V128 register class.
  9157. static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  9158. EVT VT = V64Reg.getValueType();
  9159. unsigned NarrowSize = VT.getVectorNumElements();
  9160. MVT EltTy = VT.getVectorElementType().getSimpleVT();
  9161. MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  9162. SDLoc DL(V64Reg);
  9163. return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
  9164. V64Reg, DAG.getConstant(0, DL, MVT::i64));
  9165. }
  9166. /// getExtFactor - Determine the adjustment factor for the position when
  9167. /// generating an "extract from vector registers" instruction.
  9168. static unsigned getExtFactor(SDValue &V) {
  9169. EVT EltType = V.getValueType().getVectorElementType();
  9170. return EltType.getSizeInBits() / 8;
  9171. }
  9172. /// NarrowVector - Given a value in the V128 register class, produce the
  9173. /// equivalent value in the V64 register class.
  9174. static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  9175. EVT VT = V128Reg.getValueType();
  9176. unsigned WideSize = VT.getVectorNumElements();
  9177. MVT EltTy = VT.getVectorElementType().getSimpleVT();
  9178. MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
  9179. SDLoc DL(V128Reg);
  9180. return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
  9181. }
  9182. // Gather data to see if the operation can be modelled as a
  9183. // shuffle in combination with VEXTs.
  9184. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
  9185. SelectionDAG &DAG) const {
  9186. assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  9187. LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
  9188. SDLoc dl(Op);
  9189. EVT VT = Op.getValueType();
  9190. assert(!VT.isScalableVector() &&
  9191. "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
  9192. unsigned NumElts = VT.getVectorNumElements();
  9193. struct ShuffleSourceInfo {
  9194. SDValue Vec;
  9195. unsigned MinElt;
  9196. unsigned MaxElt;
  9197. // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
  9198. // be compatible with the shuffle we intend to construct. As a result
  9199. // ShuffleVec will be some sliding window into the original Vec.
  9200. SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element
// "WindowBase + i * WindowScale" in ShuffleVec.
  9203. int WindowBase;
  9204. int WindowScale;
  9205. ShuffleSourceInfo(SDValue Vec)
  9206. : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
  9207. ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
  9208. bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  9209. };
  9210. // First gather all vectors used as an immediate source for this BUILD_VECTOR
  9211. // node.
  9212. SmallVector<ShuffleSourceInfo, 2> Sources;
  9213. for (unsigned i = 0; i < NumElts; ++i) {
  9214. SDValue V = Op.getOperand(i);
  9215. if (V.isUndef())
  9216. continue;
  9217. else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  9218. !isa<ConstantSDNode>(V.getOperand(1)) ||
  9219. V.getOperand(0).getValueType().isScalableVector()) {
  9220. LLVM_DEBUG(
  9221. dbgs() << "Reshuffle failed: "
  9222. "a shuffle can only come from building a vector from "
  9223. "various elements of other fixed-width vectors, provided "
  9224. "their indices are constant\n");
  9225. return SDValue();
  9226. }
  9227. // Add this element source to the list if it's not already there.
  9228. SDValue SourceVec = V.getOperand(0);
  9229. auto Source = find(Sources, SourceVec);
  9230. if (Source == Sources.end())
  9231. Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
  9232. // Update the minimum and maximum lane number seen.
  9233. unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
  9234. Source->MinElt = std::min(Source->MinElt, EltNo);
  9235. Source->MaxElt = std::max(Source->MaxElt, EltNo);
  9236. }
  9237. // If we have 3 or 4 sources, try to generate a TBL, which will at least be
  9238. // better than moving to/from gpr registers for larger vectors.
  9239. if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
  9240. // Construct a mask for the tbl. We may need to adjust the index for types
  9241. // larger than i8.
  9242. SmallVector<unsigned, 16> Mask;
  9243. unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
  9244. for (unsigned I = 0; I < NumElts; ++I) {
  9245. SDValue V = Op.getOperand(I);
  9246. if (V.isUndef()) {
  9247. for (unsigned OF = 0; OF < OutputFactor; OF++)
  9248. Mask.push_back(-1);
  9249. continue;
  9250. }
  9251. // Set the Mask lanes adjusted for the size of the input and output
  9252. // lanes. The Mask is always i8, so it will set OutputFactor lanes per
  9253. // output element, adjusted in their positions per input and output types.
  9254. unsigned Lane = V.getConstantOperandVal(1);
  9255. for (unsigned S = 0; S < Sources.size(); S++) {
  9256. if (V.getOperand(0) == Sources[S].Vec) {
  9257. unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
  9258. unsigned InputBase = 16 * S + Lane * InputSize / 8;
  9259. for (unsigned OF = 0; OF < OutputFactor; OF++)
  9260. Mask.push_back(InputBase + OF);
  9261. break;
  9262. }
  9263. }
  9264. }
  9265. // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
  9266. // v16i8, and the TBLMask
  9267. SmallVector<SDValue, 16> TBLOperands;
  9268. TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
  9269. ? Intrinsic::aarch64_neon_tbl3
  9270. : Intrinsic::aarch64_neon_tbl4,
  9271. dl, MVT::i32));
  9272. for (unsigned i = 0; i < Sources.size(); i++) {
  9273. SDValue Src = Sources[i].Vec;
  9274. EVT SrcVT = Src.getValueType();
  9275. Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
  9276. assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
  9277. "Expected a legally typed vector");
  9278. if (SrcVT.is64BitVector())
  9279. Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
  9280. DAG.getUNDEF(MVT::v8i8));
  9281. TBLOperands.push_back(Src);
  9282. }
  9283. SmallVector<SDValue, 16> TBLMask;
  9284. for (unsigned i = 0; i < Mask.size(); i++)
  9285. TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
  9286. assert((Mask.size() == 8 || Mask.size() == 16) &&
  9287. "Expected a v8i8 or v16i8 Mask");
  9288. TBLOperands.push_back(
  9289. DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
  9290. SDValue Shuffle =
  9291. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
  9292. Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
  9293. return DAG.getBitcast(VT, Shuffle);
  9294. }
  9295. if (Sources.size() > 2) {
  9296. LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
  9297. << "sensible when at most two source vectors are "
  9298. << "involved\n");
  9299. return SDValue();
  9300. }
  9301. // Find out the smallest element size among result and two sources, and use
  9302. // it as element size to build the shuffle_vector.
  9303. EVT SmallestEltTy = VT.getVectorElementType();
  9304. for (auto &Source : Sources) {
  9305. EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
  9306. if (SrcEltTy.bitsLT(SmallestEltTy)) {
  9307. SmallestEltTy = SrcEltTy;
  9308. }
  9309. }
  9310. unsigned ResMultiplier =
  9311. VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
  9312. uint64_t VTSize = VT.getFixedSizeInBits();
  9313. NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
  9314. EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
  9315. // If the source vector is too wide or too narrow, we may nevertheless be able
  9316. // to construct a compatible shuffle either by concatenating it with UNDEF or
  9317. // extracting a suitable range of elements.
  9318. for (auto &Src : Sources) {
  9319. EVT SrcVT = Src.ShuffleVec.getValueType();
  9320. TypeSize SrcVTSize = SrcVT.getSizeInBits();
  9321. if (SrcVTSize == TypeSize::Fixed(VTSize))
  9322. continue;
  9323. // This stage of the search produces a source with the same element type as
  9324. // the original, but with a total width matching the BUILD_VECTOR output.
  9325. EVT EltVT = SrcVT.getVectorElementType();
  9326. unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
  9327. EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
  9328. if (SrcVTSize.getFixedValue() < VTSize) {
  9329. assert(2 * SrcVTSize == VTSize);
  9330. // We can pad out the smaller vector for free, so if it's part of a
  9331. // shuffle...
  9332. Src.ShuffleVec =
  9333. DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
  9334. DAG.getUNDEF(Src.ShuffleVec.getValueType()));
  9335. continue;
  9336. }
  9337. if (SrcVTSize.getFixedValue() != 2 * VTSize) {
  9338. LLVM_DEBUG(
  9339. dbgs() << "Reshuffle failed: result vector too small to extract\n");
  9340. return SDValue();
  9341. }
  9342. if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
  9343. LLVM_DEBUG(
  9344. dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
  9345. return SDValue();
  9346. }
  9347. if (Src.MinElt >= NumSrcElts) {
  9348. // The extraction can just take the second half
  9349. Src.ShuffleVec =
  9350. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  9351. DAG.getConstant(NumSrcElts, dl, MVT::i64));
  9352. Src.WindowBase = -NumSrcElts;
  9353. } else if (Src.MaxElt < NumSrcElts) {
  9354. // The extraction can just take the first half
  9355. Src.ShuffleVec =
  9356. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  9357. DAG.getConstant(0, dl, MVT::i64));
  9358. } else {
  9359. // An actual VEXT is needed
  9360. SDValue VEXTSrc1 =
  9361. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  9362. DAG.getConstant(0, dl, MVT::i64));
  9363. SDValue VEXTSrc2 =
  9364. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  9365. DAG.getConstant(NumSrcElts, dl, MVT::i64));
  9366. unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
  9367. if (!SrcVT.is64BitVector()) {
  9368. LLVM_DEBUG(
  9369. dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
  9370. "for SVE vectors.");
  9371. return SDValue();
  9372. }
  9373. Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
  9374. VEXTSrc2,
  9375. DAG.getConstant(Imm, dl, MVT::i32));
  9376. Src.WindowBase = -Src.MinElt;
  9377. }
  9378. }
  9379. // Another possible incompatibility occurs from the vector element types. We
  9380. // can fix this by bitcasting the source vectors to the same type we intend
  9381. // for the shuffle.
  9382. for (auto &Src : Sources) {
  9383. EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
  9384. if (SrcEltTy == SmallestEltTy)
  9385. continue;
  9386. assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
  9387. Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
  9388. Src.WindowScale =
  9389. SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
  9390. Src.WindowBase *= Src.WindowScale;
  9391. }
  9392. // Final check before we try to actually produce a shuffle.
  9393. LLVM_DEBUG(for (auto Src
  9394. : Sources)
  9395. assert(Src.ShuffleVec.getValueType() == ShuffleVT););
  9396. // The stars all align, our next step is to produce the mask for the shuffle.
  9397. SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  9398. int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  9399. for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
  9400. SDValue Entry = Op.getOperand(i);
  9401. if (Entry.isUndef())
  9402. continue;
  9403. auto Src = find(Sources, Entry.getOperand(0));
  9404. int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
  9405. // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
  9406. // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
  9407. // segment.
  9408. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
  9409. int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
  9410. VT.getScalarSizeInBits());
  9411. int LanesDefined = BitsDefined / BitsPerShuffleLane;
  9412. // This source is expected to fill ResMultiplier lanes of the final shuffle,
  9413. // starting at the appropriate offset.
  9414. int *LaneMask = &Mask[i * ResMultiplier];
  9415. int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
  9416. ExtractBase += NumElts * (Src - Sources.begin());
  9417. for (int j = 0; j < LanesDefined; ++j)
  9418. LaneMask[j] = ExtractBase + j;
  9419. }
  9420. // Final check before we try to produce nonsense...
  9421. if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
  9422. LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
  9423. return SDValue();
  9424. }
  9425. SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  9426. for (unsigned i = 0; i < Sources.size(); ++i)
  9427. ShuffleOps[i] = Sources[i].ShuffleVec;
  9428. SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
  9429. ShuffleOps[1], Mask);
  9430. SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  9431. LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
  9432. dbgs() << "Reshuffle, creating node: "; V.dump(););
  9433. return V;
  9434. }
  9435. // check if an EXT instruction can handle the shuffle mask when the
  9436. // vector sources of the shuffle are the same.
  9437. static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  9438. unsigned NumElts = VT.getVectorNumElements();
  9439. // Assume that the first shuffle index is not UNDEF. Fail if it is.
  9440. if (M[0] < 0)
  9441. return false;
  9442. Imm = M[0];
  9443. // If this is a VEXT shuffle, the immediate value is the index of the first
  9444. // element. The other shuffle indices must be the successive elements after
  9445. // the first one.
  9446. unsigned ExpectedElt = Imm;
  9447. for (unsigned i = 1; i < NumElts; ++i) {
  9448. // Increment the expected index. If it wraps around, just follow it
  9449. // back to index zero and keep going.
  9450. ++ExpectedElt;
  9451. if (ExpectedElt == NumElts)
  9452. ExpectedElt = 0;
  9453. if (M[i] < 0)
  9454. continue; // ignore UNDEF indices
  9455. if (ExpectedElt != static_cast<unsigned>(M[i]))
  9456. return false;
  9457. }
  9458. return true;
  9459. }
  9460. // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
  9461. // v4i32s. This is really a truncate, which we can construct out of (legal)
  9462. // concats and truncate nodes.
  9463. static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
  9464. if (V.getValueType() != MVT::v16i8)
  9465. return SDValue();
  9466. assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
  9467. for (unsigned X = 0; X < 4; X++) {
  9468. // Check the first item in each group is an extract from lane 0 of a v4i32
  9469. // or v4i16.
  9470. SDValue BaseExt = V.getOperand(X * 4);
  9471. if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  9472. (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
  9473. BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
  9474. !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
  9475. BaseExt.getConstantOperandVal(1) != 0)
  9476. return SDValue();
  9477. SDValue Base = BaseExt.getOperand(0);
  9478. // And check the other items are extracts from the same vector.
  9479. for (unsigned Y = 1; Y < 4; Y++) {
  9480. SDValue Ext = V.getOperand(X * 4 + Y);
  9481. if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  9482. Ext.getOperand(0) != Base ||
  9483. !isa<ConstantSDNode>(Ext.getOperand(1)) ||
  9484. Ext.getConstantOperandVal(1) != Y)
  9485. return SDValue();
  9486. }
  9487. }
// Turn the buildvector into a series of truncates and concats, which will
// become uzip1's. Any v4i32s we found get truncated to v4i16, which are
// concatenated together to produce two v8i16s. These are both truncated and
// concatenated together.
  9492. SDLoc DL(V);
  9493. SDValue Trunc[4] = {
  9494. V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
  9495. V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
  9496. for (SDValue &V : Trunc)
  9497. if (V.getValueType() == MVT::v4i32)
  9498. V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
  9499. SDValue Concat0 =
  9500. DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
  9501. SDValue Concat1 =
  9502. DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
  9503. SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
  9504. SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
  9505. return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
  9506. }
  9507. /// Check if a vector shuffle corresponds to a DUP instructions with a larger
  9508. /// element width than the vector lane type. If that is the case the function
  9509. /// returns true and writes the value of the DUP instruction lane operand into
  9510. /// DupLaneOp
  9511. static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
  9512. unsigned &DupLaneOp) {
  9513. assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
  9514. "Only possible block sizes for wide DUP are: 16, 32, 64");
  9515. if (BlockSize <= VT.getScalarSizeInBits())
  9516. return false;
  9517. if (BlockSize % VT.getScalarSizeInBits() != 0)
  9518. return false;
  9519. if (VT.getSizeInBits() % BlockSize != 0)
  9520. return false;
  9521. size_t SingleVecNumElements = VT.getVectorNumElements();
  9522. size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
  9523. size_t NumBlocks = VT.getSizeInBits() / BlockSize;
  9524. // We are looking for masks like
  9525. // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
// might be replaced by 'undefined'. BlockElts will eventually contain the
// lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
// for the above examples).
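// e.g. for v8i16 with BlockSize == 32, the mask <2, 3, 2, 3, 2, 3, 2, 3>
// collapses to BlockElts = [2, 3] and yields DupLaneOp = 1 (a DUP of the
// second 32-bit lane).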
  9529. SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
  9530. for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
  9531. for (size_t I = 0; I < NumEltsPerBlock; I++) {
  9532. int Elt = M[BlockIndex * NumEltsPerBlock + I];
  9533. if (Elt < 0)
  9534. continue;
  9535. // For now we don't support shuffles that use the second operand
  9536. if ((unsigned)Elt >= SingleVecNumElements)
  9537. return false;
  9538. if (BlockElts[I] < 0)
  9539. BlockElts[I] = Elt;
  9540. else if (BlockElts[I] != Elt)
  9541. return false;
  9542. }
  9543. // We found a candidate block (possibly with some undefs). It must be a
  9544. // sequence of consecutive integers starting with a value divisible by
  9545. // NumEltsPerBlock with some values possibly replaced by undef-s.
  9546. // Find first non-undef element
  9547. auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
  9548. assert(FirstRealEltIter != BlockElts.end() &&
  9549. "Shuffle with all-undefs must have been caught by previous cases, "
  9550. "e.g. isSplat()");
  9551. if (FirstRealEltIter == BlockElts.end()) {
  9552. DupLaneOp = 0;
  9553. return true;
  9554. }
  9555. // Index of FirstRealElt in BlockElts
  9556. size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
  9557. if ((unsigned)*FirstRealEltIter < FirstRealIndex)
  9558. return false;
  9559. // BlockElts[0] must have the following value if it isn't undef:
  9560. size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
  9561. // Check the first element
  9562. if (Elt0 % NumEltsPerBlock != 0)
  9563. return false;
  9564. // Check that the sequence indeed consists of consecutive integers (modulo
  9565. // undefs)
  9566. for (size_t I = 0; I < NumEltsPerBlock; I++)
  9567. if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
  9568. return false;
  9569. DupLaneOp = Elt0 / NumEltsPerBlock;
  9570. return true;
  9571. }
  9572. // check if an EXT instruction can handle the shuffle mask when the
  9573. // vector sources of the shuffle are different.
  9574. static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
  9575. unsigned &Imm) {
  9576. // Look for the first non-undef element.
  9577. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
// Benefit from APInt to handle overflow when calculating the expected element.
  9579. unsigned NumElts = VT.getVectorNumElements();
  9580. unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  9581. APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  9582. // The following shuffle indices must be the successive elements after the
  9583. // first real element.
  9584. bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
  9585. return Elt != ExpectedElt++ && Elt != -1;
  9586. });
  9587. if (FoundWrongElt)
  9588. return false;
  9589. // The index of an EXT is the first element if it is not UNDEF.
  9590. // Watch out for the beginning UNDEFs. The EXT index should be the expected
  9591. // value of the first element. E.g.
  9592. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  9593. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  9594. // ExpectedElt is the last mask index plus 1.
  9595. Imm = ExpectedElt.getZExtValue();
// There are two different cases that require swapping the two input vectors.
// For example, for vector <4 x i32> we have the following cases:
// Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
// Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
// For both cases, we finally use mask <5, 6, 7, 0>, which requires the two
// input vectors to be swapped.
  9602. if (Imm < NumElts)
  9603. ReverseEXT = true;
  9604. else
  9605. Imm -= NumElts;
  9606. return true;
  9607. }
  9608. /// isREVMask - Check if a vector shuffle corresponds to a REV
  9609. /// instruction with the specified blocksize. (The order of the elements
  9610. /// within each block of the vector is reversed.)
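/// e.g. a REV32 on v8i8 data corresponds to the mask <3, 2, 1, 0, 7, 6, 5, 4>.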
  9611. static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  9612. assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
  9613. BlockSize == 128) &&
  9614. "Only possible block sizes for REV are: 16, 32, 64, 128");
  9615. unsigned EltSz = VT.getScalarSizeInBits();
  9616. unsigned NumElts = VT.getVectorNumElements();
  9617. unsigned BlockElts = M[0] + 1;
  9618. // If the first shuffle index is UNDEF, be optimistic.
  9619. if (M[0] < 0)
  9620. BlockElts = BlockSize / EltSz;
  9621. if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
  9622. return false;
  9623. for (unsigned i = 0; i < NumElts; ++i) {
  9624. if (M[i] < 0)
  9625. continue; // ignore UNDEF indices
  9626. if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
  9627. return false;
  9628. }
  9629. return true;
  9630. }
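// e.g. for v4i32, ZIP1 corresponds to the mask <0, 4, 1, 5> (WhichResult = 0)
// and ZIP2 to <2, 6, 3, 7> (WhichResult = 1).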
  9631. static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9632. unsigned NumElts = VT.getVectorNumElements();
  9633. if (NumElts % 2 != 0)
  9634. return false;
  9635. WhichResult = (M[0] == 0 ? 0 : 1);
  9636. unsigned Idx = WhichResult * NumElts / 2;
  9637. for (unsigned i = 0; i != NumElts; i += 2) {
  9638. if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
  9639. (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
  9640. return false;
  9641. Idx += 1;
  9642. }
  9643. return true;
  9644. }
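// e.g. for v4i32, UZP1 corresponds to the mask <0, 2, 4, 6> (WhichResult = 0)
// and UZP2 to <1, 3, 5, 7> (WhichResult = 1).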
  9645. static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9646. unsigned NumElts = VT.getVectorNumElements();
  9647. WhichResult = (M[0] == 0 ? 0 : 1);
  9648. for (unsigned i = 0; i != NumElts; ++i) {
  9649. if (M[i] < 0)
  9650. continue; // ignore UNDEF indices
  9651. if ((unsigned)M[i] != 2 * i + WhichResult)
  9652. return false;
  9653. }
  9654. return true;
  9655. }
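// e.g. for v4i32, TRN1 corresponds to the mask <0, 4, 2, 6> (WhichResult = 0)
// and TRN2 to <1, 5, 3, 7> (WhichResult = 1).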
  9656. static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9657. unsigned NumElts = VT.getVectorNumElements();
  9658. if (NumElts % 2 != 0)
  9659. return false;
  9660. WhichResult = (M[0] == 0 ? 0 : 1);
  9661. for (unsigned i = 0; i < NumElts; i += 2) {
  9662. if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
  9663. (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
  9664. return false;
  9665. }
  9666. return true;
  9667. }
  9668. /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
  9669. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  9670. /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
  9671. static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9672. unsigned NumElts = VT.getVectorNumElements();
  9673. if (NumElts % 2 != 0)
  9674. return false;
  9675. WhichResult = (M[0] == 0 ? 0 : 1);
  9676. unsigned Idx = WhichResult * NumElts / 2;
  9677. for (unsigned i = 0; i != NumElts; i += 2) {
  9678. if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
  9679. (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
  9680. return false;
  9681. Idx += 1;
  9682. }
  9683. return true;
  9684. }
  9685. /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
  9686. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  9687. /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
  9688. static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9689. unsigned Half = VT.getVectorNumElements() / 2;
  9690. WhichResult = (M[0] == 0 ? 0 : 1);
  9691. for (unsigned j = 0; j != 2; ++j) {
  9692. unsigned Idx = WhichResult;
  9693. for (unsigned i = 0; i != Half; ++i) {
  9694. int MIdx = M[i + j * Half];
  9695. if (MIdx >= 0 && (unsigned)MIdx != Idx)
  9696. return false;
  9697. Idx += 2;
  9698. }
  9699. }
  9700. return true;
  9701. }
  9702. /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
  9703. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  9704. /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
  9705. static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  9706. unsigned NumElts = VT.getVectorNumElements();
  9707. if (NumElts % 2 != 0)
  9708. return false;
  9709. WhichResult = (M[0] == 0 ? 0 : 1);
  9710. for (unsigned i = 0; i < NumElts; i += 2) {
  9711. if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
  9712. (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
  9713. return false;
  9714. }
  9715. return true;
  9716. }
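// For example, for a 4-element shuffle the mask <0, 1, 6, 3> is an INS mask:
// every lane but lane 2 is an identity copy of the LHS, so DstIsLeft is true
// and Anomaly is 2 (the inserted element comes from lane 2 of the RHS).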
  9717. static bool isINSMask(ArrayRef<int> M, int NumInputElements,
  9718. bool &DstIsLeft, int &Anomaly) {
  9719. if (M.size() != static_cast<size_t>(NumInputElements))
  9720. return false;
  9721. int NumLHSMatch = 0, NumRHSMatch = 0;
  9722. int LastLHSMismatch = -1, LastRHSMismatch = -1;
  9723. for (int i = 0; i < NumInputElements; ++i) {
  9724. if (M[i] == -1) {
  9725. ++NumLHSMatch;
  9726. ++NumRHSMatch;
  9727. continue;
  9728. }
  9729. if (M[i] == i)
  9730. ++NumLHSMatch;
  9731. else
  9732. LastLHSMismatch = i;
  9733. if (M[i] == i + NumInputElements)
  9734. ++NumRHSMatch;
  9735. else
  9736. LastRHSMismatch = i;
  9737. }
  9738. if (NumLHSMatch == NumInputElements - 1) {
  9739. DstIsLeft = true;
  9740. Anomaly = LastLHSMismatch;
  9741. return true;
  9742. } else if (NumRHSMatch == NumInputElements - 1) {
  9743. DstIsLeft = false;
  9744. Anomaly = LastRHSMismatch;
  9745. return true;
  9746. }
  9747. return false;
  9748. }
  9749. static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  9750. if (VT.getSizeInBits() != 128)
  9751. return false;
  9752. unsigned NumElts = VT.getVectorNumElements();
  9753. for (int I = 0, E = NumElts / 2; I != E; I++) {
  9754. if (Mask[I] != I)
  9755. return false;
  9756. }
  9757. int Offset = NumElts / 2;
  9758. for (int I = NumElts / 2, E = NumElts; I != E; I++) {
  9759. if (Mask[I] != I + SplitLHS * Offset)
  9760. return false;
  9761. }
  9762. return true;
  9763. }
  9764. static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  9765. SDLoc DL(Op);
  9766. EVT VT = Op.getValueType();
  9767. SDValue V0 = Op.getOperand(0);
  9768. SDValue V1 = Op.getOperand(1);
  9769. ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
  9770. if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
  9771. VT.getVectorElementType() != V1.getValueType().getVectorElementType())
  9772. return SDValue();
  9773. bool SplitV0 = V0.getValueSizeInBits() == 128;
  9774. if (!isConcatMask(Mask, VT, SplitV0))
  9775. return SDValue();
  9776. EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  9777. if (SplitV0) {
  9778. V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
  9779. DAG.getConstant(0, DL, MVT::i64));
  9780. }
  9781. if (V1.getValueSizeInBits() == 128) {
  9782. V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
  9783. DAG.getConstant(0, DL, MVT::i64));
  9784. }
  9785. return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
  9786. }
9787. /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9788. /// the specified operations to build the shuffle. ID is the perfect-shuffle
9789. /// table index, V1 and V2 are the original shuffle inputs, PFEntry is the
9790. /// perfect-shuffle table entry, and LHS/RHS are the immediate inputs for this
9791. /// stage of the shuffle.
  9792. static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
  9793. SDValue V2, unsigned PFEntry, SDValue LHS,
  9794. SDValue RHS, SelectionDAG &DAG,
  9795. const SDLoc &dl) {
  9796. unsigned OpNum = (PFEntry >> 26) & 0x0F;
  9797. unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
  9798. unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
  9799. enum {
  9800. OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  9801. OP_VREV,
  9802. OP_VDUP0,
  9803. OP_VDUP1,
  9804. OP_VDUP2,
  9805. OP_VDUP3,
  9806. OP_VEXT1,
  9807. OP_VEXT2,
  9808. OP_VEXT3,
  9809. OP_VUZPL, // VUZP, left result
  9810. OP_VUZPR, // VUZP, right result
  9811. OP_VZIPL, // VZIP, left result
  9812. OP_VZIPR, // VZIP, right result
  9813. OP_VTRNL, // VTRN, left result
  9814. OP_VTRNR, // VTRN, right result
  9815. OP_MOVLANE // Move lane. RHSID is the lane to move into
  9816. };
  9817. if (OpNum == OP_COPY) {
  9818. if (LHSID == (1 * 9 + 2) * 9 + 3)
  9819. return LHS;
  9820. assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
  9821. return RHS;
  9822. }
  9823. if (OpNum == OP_MOVLANE) {
  9824. // Decompose a PerfectShuffle ID to get the Mask for lane Elt
  9825. auto getPFIDLane = [](unsigned ID, int Elt) -> int {
  9826. assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
  9827. Elt = 3 - Elt;
  9828. while (Elt > 0) {
  9829. ID /= 9;
  9830. Elt--;
  9831. }
  9832. return (ID % 9 == 8) ? -1 : ID % 9;
  9833. };
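// For example, the mask <1, 2, 3, 4> is encoded as ID = ((1*9+2)*9+3)*9+4,
// so getPFIDLane(ID, 0) == 1 and getPFIDLane(ID, 3) == 4; an UNDEF lane is
// encoded as the digit 8 and decodes to -1.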
9834. // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
9835. // lane to move from is taken from the PFID and always comes from one of the
9836. // original input vectors (V1 or V2).
  9837. SDValue OpLHS = GeneratePerfectShuffle(
  9838. LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  9839. EVT VT = OpLHS.getValueType();
  9840. assert(RHSID < 8 && "Expected a lane index for RHSID!");
  9841. unsigned ExtLane = 0;
  9842. SDValue Input;
  9843. // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
  9844. // convert into a higher type.
  9845. if (RHSID & 0x4) {
  9846. int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
  9847. if (MaskElt == -1)
  9848. MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
  9849. assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
  9850. ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
  9851. Input = MaskElt < 2 ? V1 : V2;
  9852. if (VT.getScalarSizeInBits() == 16) {
  9853. Input = DAG.getBitcast(MVT::v2f32, Input);
  9854. OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
  9855. } else {
  9856. assert(VT.getScalarSizeInBits() == 32 &&
  9857. "Expected 16 or 32 bit shuffle elemements");
  9858. Input = DAG.getBitcast(MVT::v2f64, Input);
  9859. OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
  9860. }
  9861. } else {
  9862. int MaskElt = getPFIDLane(ID, RHSID);
  9863. assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
  9864. ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
  9865. Input = MaskElt < 4 ? V1 : V2;
  9866. // Be careful about creating illegal types. Use f16 instead of i16.
  9867. if (VT == MVT::v4i16) {
  9868. Input = DAG.getBitcast(MVT::v4f16, Input);
  9869. OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
  9870. }
  9871. }
  9872. SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
  9873. Input.getValueType().getVectorElementType(),
  9874. Input, DAG.getVectorIdxConstant(ExtLane, dl));
  9875. SDValue Ins =
  9876. DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
  9877. Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
  9878. return DAG.getBitcast(VT, Ins);
  9879. }
  9880. SDValue OpLHS, OpRHS;
  9881. OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
  9882. RHS, DAG, dl);
  9883. OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
  9884. RHS, DAG, dl);
  9885. EVT VT = OpLHS.getValueType();
  9886. switch (OpNum) {
  9887. default:
  9888. llvm_unreachable("Unknown shuffle opcode!");
  9889. case OP_VREV:
  9890. // VREV divides the vector in half and swaps within the half.
  9891. if (VT.getVectorElementType() == MVT::i32 ||
  9892. VT.getVectorElementType() == MVT::f32)
  9893. return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
  9894. // vrev <4 x i16> -> REV32
  9895. if (VT.getVectorElementType() == MVT::i16 ||
  9896. VT.getVectorElementType() == MVT::f16 ||
  9897. VT.getVectorElementType() == MVT::bf16)
  9898. return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
  9899. // vrev <4 x i8> -> REV16
  9900. assert(VT.getVectorElementType() == MVT::i8);
  9901. return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
  9902. case OP_VDUP0:
  9903. case OP_VDUP1:
  9904. case OP_VDUP2:
  9905. case OP_VDUP3: {
  9906. EVT EltTy = VT.getVectorElementType();
  9907. unsigned Opcode;
  9908. if (EltTy == MVT::i8)
  9909. Opcode = AArch64ISD::DUPLANE8;
  9910. else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
  9911. Opcode = AArch64ISD::DUPLANE16;
  9912. else if (EltTy == MVT::i32 || EltTy == MVT::f32)
  9913. Opcode = AArch64ISD::DUPLANE32;
  9914. else if (EltTy == MVT::i64 || EltTy == MVT::f64)
  9915. Opcode = AArch64ISD::DUPLANE64;
  9916. else
  9917. llvm_unreachable("Invalid vector element type?");
  9918. if (VT.getSizeInBits() == 64)
  9919. OpLHS = WidenVector(OpLHS, DAG);
  9920. SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
  9921. return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
  9922. }
  9923. case OP_VEXT1:
  9924. case OP_VEXT2:
  9925. case OP_VEXT3: {
  9926. unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
  9927. return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
  9928. DAG.getConstant(Imm, dl, MVT::i32));
  9929. }
  9930. case OP_VUZPL:
  9931. return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
  9932. OpRHS);
  9933. case OP_VUZPR:
  9934. return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
  9935. OpRHS);
  9936. case OP_VZIPL:
  9937. return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
  9938. OpRHS);
  9939. case OP_VZIPR:
  9940. return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
  9941. OpRHS);
  9942. case OP_VTRNL:
  9943. return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
  9944. OpRHS);
  9945. case OP_VTRNR:
  9946. return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
  9947. OpRHS);
  9948. }
  9949. }
  9950. static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
  9951. SelectionDAG &DAG) {
  9952. // Check to see if we can use the TBL instruction.
  9953. SDValue V1 = Op.getOperand(0);
  9954. SDValue V2 = Op.getOperand(1);
  9955. SDLoc DL(Op);
  9956. EVT EltVT = Op.getValueType().getVectorElementType();
  9957. unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
  9958. bool Swap = false;
  9959. if (V1.isUndef() || isZerosVector(V1.getNode())) {
  9960. std::swap(V1, V2);
  9961. Swap = true;
  9962. }
  9963. // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
9964. // out-of-range values with 0s. We do need to make sure that any out-of-range
  9965. // values are really out-of-range for a v16i8 vector.
  9966. bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
  9967. MVT IndexVT = MVT::v8i8;
  9968. unsigned IndexLen = 8;
  9969. if (Op.getValueSizeInBits() == 128) {
  9970. IndexVT = MVT::v16i8;
  9971. IndexLen = 16;
  9972. }
  9973. SmallVector<SDValue, 8> TBLMask;
  9974. for (int Val : ShuffleMask) {
  9975. for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
  9976. unsigned Offset = Byte + Val * BytesPerElt;
  9977. if (Swap)
  9978. Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
  9979. if (IsUndefOrZero && Offset >= IndexLen)
  9980. Offset = 255;
  9981. TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
  9982. }
  9983. }
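// For example, a v4i16 shuffle with mask <0, 4, 1, 5> expands to the byte
// indices <0, 1, 8, 9, 2, 3, 10, 11>, since each i16 lane covers two bytes
// and V2's bytes start at IndexLen (8) within the concatenated table.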
  9984. SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  9985. SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
  9986. SDValue Shuffle;
  9987. if (IsUndefOrZero) {
  9988. if (IndexLen == 8)
  9989. V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
  9990. Shuffle = DAG.getNode(
  9991. ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
  9992. DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
  9993. DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
  9994. } else {
  9995. if (IndexLen == 8) {
  9996. V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
  9997. Shuffle = DAG.getNode(
  9998. ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
  9999. DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
  10000. DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
  10001. } else {
  10002. // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
  10003. // cannot currently represent the register constraints on the input
  10004. // table registers.
  10005. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
  10006. // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
  10007. // IndexLen));
  10008. Shuffle = DAG.getNode(
  10009. ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
  10010. DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
  10011. V2Cst,
  10012. DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
  10013. }
  10014. }
  10015. return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
  10016. }
  10017. static unsigned getDUPLANEOp(EVT EltType) {
  10018. if (EltType == MVT::i8)
  10019. return AArch64ISD::DUPLANE8;
  10020. if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
  10021. return AArch64ISD::DUPLANE16;
  10022. if (EltType == MVT::i32 || EltType == MVT::f32)
  10023. return AArch64ISD::DUPLANE32;
  10024. if (EltType == MVT::i64 || EltType == MVT::f64)
  10025. return AArch64ISD::DUPLANE64;
  10026. llvm_unreachable("Invalid vector element type?");
  10027. }
  10028. static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
  10029. unsigned Opcode, SelectionDAG &DAG) {
  10030. // Try to eliminate a bitcasted extract subvector before a DUPLANE.
  10031. auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
  10032. // Match: dup (bitcast (extract_subv X, C)), LaneC
  10033. if (BitCast.getOpcode() != ISD::BITCAST ||
  10034. BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
  10035. return false;
  10036. // The extract index must align in the destination type. That may not
10037. // happen if the bitcast is from a narrow to a wide type.
  10038. SDValue Extract = BitCast.getOperand(0);
  10039. unsigned ExtIdx = Extract.getConstantOperandVal(1);
  10040. unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
  10041. unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
  10042. unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
  10043. if (ExtIdxInBits % CastedEltBitWidth != 0)
  10044. return false;
  10045. // Can't handle cases where vector size is not 128-bit
  10046. if (!Extract.getOperand(0).getValueType().is128BitVector())
  10047. return false;
  10048. // Update the lane value by offsetting with the scaled extract index.
  10049. LaneC += ExtIdxInBits / CastedEltBitWidth;
  10050. // Determine the casted vector type of the wide vector input.
  10051. // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
  10052. // Examples:
  10053. // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
  10054. // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
  10055. unsigned SrcVecNumElts =
  10056. Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
  10057. CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
  10058. SrcVecNumElts);
  10059. return true;
  10060. };
  10061. MVT CastVT;
  10062. if (getScaledOffsetDup(V, Lane, CastVT)) {
  10063. V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
  10064. } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
  10065. V.getOperand(0).getValueType().is128BitVector()) {
  10066. // The lane is incremented by the index of the extract.
  10067. // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
  10068. Lane += V.getConstantOperandVal(1);
  10069. V = V.getOperand(0);
  10070. } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
  10071. // The lane is decremented if we are splatting from the 2nd operand.
  10072. // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
  10073. unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
  10074. Lane -= Idx * VT.getVectorNumElements() / 2;
  10075. V = WidenVector(V.getOperand(Idx), DAG);
  10076. } else if (VT.getSizeInBits() == 64) {
10077. // Widen the operand to a 128-bit register with undef.
  10078. V = WidenVector(V, DAG);
  10079. }
  10080. return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
  10081. }
10082. // Return true if we can form a new shuffle mask by checking that every pair
10083. // of adjacent values in the parameter mask array is consecutive and starts
10084. // at an even index.
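// For example, for v4i32 the mask <2, 3, 0, 1> widens to <1, 0>, and
// <6, 7, -1, -1> widens to <3, -1>; a mask such as <1, 2, 0, 1> does not
// widen because its first pair does not start at an even element.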
  10085. static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
  10086. SmallVectorImpl<int> &NewMask) {
  10087. unsigned NumElts = VT.getVectorNumElements();
  10088. if (NumElts % 2 != 0)
  10089. return false;
  10090. NewMask.clear();
  10091. for (unsigned i = 0; i < NumElts; i += 2) {
  10092. int M0 = M[i];
  10093. int M1 = M[i + 1];
  10094. // If both elements are undef, new mask is undef too.
  10095. if (M0 == -1 && M1 == -1) {
  10096. NewMask.push_back(-1);
  10097. continue;
  10098. }
  10099. if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
  10100. NewMask.push_back(M1 / 2);
  10101. continue;
  10102. }
  10103. if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
  10104. NewMask.push_back(M0 / 2);
  10105. continue;
  10106. }
  10107. NewMask.clear();
  10108. return false;
  10109. }
  10110. assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  10111. return true;
  10112. }
  10113. // Try to widen element type to get a new mask value for a better permutation
  10114. // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
  10115. // UZP1/2, TRN1/2, REV, INS, etc.
  10116. // For example:
  10117. // shufflevector <4 x i32> %a, <4 x i32> %b,
  10118. // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  10119. // is equivalent to:
  10120. // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
  10121. // Finally, we can get:
  10122. // mov v0.d[0], v1.d[1]
  10123. static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
  10124. SDLoc DL(Op);
  10125. EVT VT = Op.getValueType();
  10126. EVT ScalarVT = VT.getVectorElementType();
  10127. unsigned ElementSize = ScalarVT.getFixedSizeInBits();
  10128. SDValue V0 = Op.getOperand(0);
  10129. SDValue V1 = Op.getOperand(1);
  10130. ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10131. // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64,
10132. // we need to make sure the wider element type is legal. Thus, ElementSize
10133. // should be no larger than 32 bits, and the i1 type must also be excluded.
  10134. if (ElementSize > 32 || ElementSize == 1)
  10135. return SDValue();
  10136. SmallVector<int, 8> NewMask;
  10137. if (isWideTypeMask(Mask, VT, NewMask)) {
  10138. MVT NewEltVT = VT.isFloatingPoint()
  10139. ? MVT::getFloatingPointVT(ElementSize * 2)
  10140. : MVT::getIntegerVT(ElementSize * 2);
  10141. MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
  10142. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
  10143. V0 = DAG.getBitcast(NewVT, V0);
  10144. V1 = DAG.getBitcast(NewVT, V1);
  10145. return DAG.getBitcast(VT,
  10146. DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
  10147. }
  10148. }
  10149. return SDValue();
  10150. }
  10151. // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
  10152. static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
  10153. ArrayRef<int> ShuffleMask,
  10154. SelectionDAG &DAG) {
  10155. SDValue Tbl1 = Op->getOperand(0);
  10156. SDValue Tbl2 = Op->getOperand(1);
  10157. SDLoc dl(Op);
  10158. SDValue Tbl2ID =
  10159. DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
  10160. EVT VT = Op.getValueType();
  10161. if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
  10162. Tbl1->getOperand(0) != Tbl2ID ||
  10163. Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
  10164. Tbl2->getOperand(0) != Tbl2ID)
  10165. return SDValue();
  10166. if (Tbl1->getValueType(0) != MVT::v16i8 ||
  10167. Tbl2->getValueType(0) != MVT::v16i8)
  10168. return SDValue();
  10169. SDValue Mask1 = Tbl1->getOperand(3);
  10170. SDValue Mask2 = Tbl2->getOperand(3);
  10171. SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
  10172. for (unsigned I = 0; I < 16; I++) {
  10173. if (ShuffleMask[I] < 16)
  10174. TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
  10175. else {
  10176. auto *C =
  10177. dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
  10178. if (!C)
  10179. return SDValue();
  10180. TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
  10181. }
  10182. }
  10183. SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
  10184. SDValue ID =
  10185. DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
  10186. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
  10187. {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
  10188. Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
  10189. }
10190. // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend in zeros,
  10191. // but we don't have an appropriate instruction,
  10192. // so custom-lower it as ZIP1-with-zeros.
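// For example, for a v8i16 result with a v16i8 source, ZIP1(src, zeros)
// produces the byte pattern <s0, 0, s1, 0, ..., s7, 0>, which (assuming the
// usual little-endian lane layout) bitcasts to the zero-extended i16 lanes.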
  10193. SDValue
  10194. AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
  10195. SelectionDAG &DAG) const {
  10196. SDLoc dl(Op);
  10197. EVT VT = Op.getValueType();
  10198. SDValue SrcOp = Op.getOperand(0);
  10199. EVT SrcVT = SrcOp.getValueType();
  10200. assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
  10201. "Unexpected extension factor.");
  10202. unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
  10203. // FIXME: support multi-step zipping?
  10204. if (Scale != 2)
  10205. return SDValue();
  10206. SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
  10207. return DAG.getBitcast(VT,
  10208. DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
  10209. }
  10210. SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
  10211. SelectionDAG &DAG) const {
  10212. SDLoc dl(Op);
  10213. EVT VT = Op.getValueType();
  10214. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  10215. if (useSVEForFixedLengthVectorVT(VT,
  10216. Subtarget->forceStreamingCompatibleSVE()))
  10217. return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
  10218. // Convert shuffles that are directly supported on NEON to target-specific
  10219. // DAG nodes, instead of keeping them as shuffles and matching them again
  10220. // during code selection. This is more efficient and avoids the possibility
  10221. // of inconsistencies between legalization and selection.
  10222. ArrayRef<int> ShuffleMask = SVN->getMask();
  10223. SDValue V1 = Op.getOperand(0);
  10224. SDValue V2 = Op.getOperand(1);
  10225. assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
  10226. assert(ShuffleMask.size() == VT.getVectorNumElements() &&
  10227. "Unexpected VECTOR_SHUFFLE mask size!");
  10228. if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
  10229. return Res;
  10230. if (SVN->isSplat()) {
  10231. int Lane = SVN->getSplatIndex();
  10232. // If this is undef splat, generate it via "just" vdup, if possible.
  10233. if (Lane == -1)
  10234. Lane = 0;
  10235. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
  10236. return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
  10237. V1.getOperand(0));
  10238. // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
  10239. // constant. If so, we can just reference the lane's definition directly.
  10240. if (V1.getOpcode() == ISD::BUILD_VECTOR &&
  10241. !isa<ConstantSDNode>(V1.getOperand(Lane)))
  10242. return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
  10243. // Otherwise, duplicate from the lane of the input vector.
  10244. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
  10245. return constructDup(V1, Lane, dl, VT, Opcode, DAG);
  10246. }
  10247. // Check if the mask matches a DUP for a wider element
  10248. for (unsigned LaneSize : {64U, 32U, 16U}) {
  10249. unsigned Lane = 0;
  10250. if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
  10251. unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
  10252. : LaneSize == 32 ? AArch64ISD::DUPLANE32
  10253. : AArch64ISD::DUPLANE16;
  10254. // Cast V1 to an integer vector with required lane size
  10255. MVT NewEltTy = MVT::getIntegerVT(LaneSize);
  10256. unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
  10257. MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
  10258. V1 = DAG.getBitcast(NewVecTy, V1);
10259. // Construct the DUP instruction
  10260. V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
  10261. // Cast back to the original type
  10262. return DAG.getBitcast(VT, V1);
  10263. }
  10264. }
  10265. if (isREVMask(ShuffleMask, VT, 64))
  10266. return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
  10267. if (isREVMask(ShuffleMask, VT, 32))
  10268. return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
  10269. if (isREVMask(ShuffleMask, VT, 16))
  10270. return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
  10271. if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
  10272. (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
  10273. ShuffleVectorInst::isReverseMask(ShuffleMask)) {
  10274. SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
  10275. return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
  10276. DAG.getConstant(8, dl, MVT::i32));
  10277. }
  10278. bool ReverseEXT = false;
  10279. unsigned Imm;
  10280. if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
  10281. if (ReverseEXT)
  10282. std::swap(V1, V2);
  10283. Imm *= getExtFactor(V1);
  10284. return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
  10285. DAG.getConstant(Imm, dl, MVT::i32));
  10286. } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
  10287. Imm *= getExtFactor(V1);
  10288. return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
  10289. DAG.getConstant(Imm, dl, MVT::i32));
  10290. }
  10291. unsigned WhichResult;
  10292. if (isZIPMask(ShuffleMask, VT, WhichResult)) {
  10293. unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
  10294. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  10295. }
  10296. if (isUZPMask(ShuffleMask, VT, WhichResult)) {
  10297. unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
  10298. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  10299. }
  10300. if (isTRNMask(ShuffleMask, VT, WhichResult)) {
  10301. unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
  10302. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  10303. }
  10304. if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
  10305. unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
  10306. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  10307. }
  10308. if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
  10309. unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
  10310. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  10311. }
  10312. if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
  10313. unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
  10314. return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  10315. }
  10316. if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
  10317. return Concat;
  10318. bool DstIsLeft;
  10319. int Anomaly;
  10320. int NumInputElements = V1.getValueType().getVectorNumElements();
  10321. if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
  10322. SDValue DstVec = DstIsLeft ? V1 : V2;
  10323. SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
  10324. SDValue SrcVec = V1;
  10325. int SrcLane = ShuffleMask[Anomaly];
  10326. if (SrcLane >= NumInputElements) {
  10327. SrcVec = V2;
  10328. SrcLane -= VT.getVectorNumElements();
  10329. }
  10330. SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
  10331. EVT ScalarVT = VT.getVectorElementType();
  10332. if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
  10333. ScalarVT = MVT::i32;
  10334. return DAG.getNode(
  10335. ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
  10336. DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
  10337. DstLaneV);
  10338. }
  10339. if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
  10340. return NewSD;
  10341. // If the shuffle is not directly supported and it has 4 elements, use
  10342. // the PerfectShuffle-generated table to synthesize it from other shuffles.
  10343. unsigned NumElts = VT.getVectorNumElements();
  10344. if (NumElts == 4) {
  10345. unsigned PFIndexes[4];
  10346. for (unsigned i = 0; i != 4; ++i) {
  10347. if (ShuffleMask[i] < 0)
  10348. PFIndexes[i] = 8;
  10349. else
  10350. PFIndexes[i] = ShuffleMask[i];
  10351. }
  10352. // Compute the index in the perfect shuffle table.
  10353. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
  10354. PFIndexes[2] * 9 + PFIndexes[3];
  10355. unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
  10356. return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
  10357. dl);
  10358. }
  10359. return GenerateTBL(Op, ShuffleMask, DAG);
  10360. }
  10361. SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
  10362. SelectionDAG &DAG) const {
  10363. EVT VT = Op.getValueType();
  10364. if (useSVEForFixedLengthVectorVT(VT,
  10365. Subtarget->forceStreamingCompatibleSVE()))
  10366. return LowerToScalableOp(Op, DAG);
  10367. assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
  10368. "Unexpected vector type!");
  10369. // We can handle the constant cases during isel.
  10370. if (isa<ConstantSDNode>(Op.getOperand(0)))
  10371. return Op;
  10372. // There isn't a natural way to handle the general i1 case, so we use some
  10373. // trickery with whilelo.
  10374. SDLoc DL(Op);
  10375. SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
  10376. SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
  10377. DAG.getValueType(MVT::i1));
  10378. SDValue ID =
  10379. DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
  10380. SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  10381. if (VT == MVT::nxv1i1)
  10382. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
  10383. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
  10384. Zero, SplatVal),
  10385. Zero);
  10386. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
  10387. }
  10388. SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
  10389. SelectionDAG &DAG) const {
  10390. SDLoc DL(Op);
  10391. EVT VT = Op.getValueType();
  10392. if (!isTypeLegal(VT) || !VT.isScalableVector())
  10393. return SDValue();
  10394. // Current lowering only supports the SVE-ACLE types.
  10395. if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
  10396. return SDValue();
10397. // The DUPQ operation is independent of the element type, so normalise to i64s.
  10398. SDValue Idx128 = Op.getOperand(2);
  10399. // DUPQ can be used when idx is in range.
  10400. auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
  10401. if (CIdx && (CIdx->getZExtValue() <= 3)) {
  10402. SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
  10403. return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
  10404. }
  10405. SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
  10406. // The ACLE says this must produce the same result as:
  10407. // svtbl(data, svadd_x(svptrue_b64(),
  10408. // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
  10409. // index * 2))
  10410. SDValue One = DAG.getConstant(1, DL, MVT::i64);
  10411. SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
  10412. // create the vector 0,1,0,1,...
  10413. SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
  10414. SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
  10415. // create the vector idx64,idx64+1,idx64,idx64+1,...
  10416. SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
  10417. SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
  10418. SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
  10419. // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
  10420. SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
  10421. return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
  10422. }
  10423. static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
  10424. APInt &UndefBits) {
  10425. EVT VT = BVN->getValueType(0);
  10426. APInt SplatBits, SplatUndef;
  10427. unsigned SplatBitSize;
  10428. bool HasAnyUndefs;
  10429. if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  10430. unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
  10431. for (unsigned i = 0; i < NumSplats; ++i) {
  10432. CnstBits <<= SplatBitSize;
  10433. UndefBits <<= SplatBitSize;
  10434. CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
  10435. UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
  10436. }
  10437. return true;
  10438. }
  10439. return false;
  10440. }
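// For example, a v8i8 BUILD_VECTOR splat of 0xAB resolves with SplatBitSize 8,
// and resolveBuildVector replicates it into CnstBits == 0xABABABABABABABAB.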
  10441. // Try 64-bit splatted SIMD immediate.
  10442. static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
  10443. const APInt &Bits) {
  10444. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10445. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10446. EVT VT = Op.getValueType();
  10447. MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
  10448. if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
  10449. Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
  10450. SDLoc dl(Op);
  10451. SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
  10452. DAG.getConstant(Value, dl, MVT::i32));
  10453. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10454. }
  10455. }
  10456. return SDValue();
  10457. }
  10458. // Try 32-bit splatted SIMD immediate.
  10459. static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
  10460. const APInt &Bits,
  10461. const SDValue *LHS = nullptr) {
  10462. EVT VT = Op.getValueType();
  10463. if (VT.isFixedLengthVector() &&
  10464. DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
  10465. return SDValue();
  10466. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10467. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10468. MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
  10469. bool isAdvSIMDModImm = false;
  10470. uint64_t Shift;
  10471. if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
  10472. Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
  10473. Shift = 0;
  10474. }
  10475. else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
  10476. Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
  10477. Shift = 8;
  10478. }
  10479. else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
  10480. Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
  10481. Shift = 16;
  10482. }
  10483. else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
  10484. Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
  10485. Shift = 24;
  10486. }
  10487. if (isAdvSIMDModImm) {
  10488. SDLoc dl(Op);
  10489. SDValue Mov;
  10490. if (LHS)
  10491. Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
  10492. DAG.getConstant(Value, dl, MVT::i32),
  10493. DAG.getConstant(Shift, dl, MVT::i32));
  10494. else
  10495. Mov = DAG.getNode(NewOp, dl, MovTy,
  10496. DAG.getConstant(Value, dl, MVT::i32),
  10497. DAG.getConstant(Shift, dl, MVT::i32));
  10498. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10499. }
  10500. }
  10501. return SDValue();
  10502. }
  10503. // Try 16-bit splatted SIMD immediate.
  10504. static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
  10505. const APInt &Bits,
  10506. const SDValue *LHS = nullptr) {
  10507. EVT VT = Op.getValueType();
  10508. if (VT.isFixedLengthVector() &&
  10509. DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
  10510. return SDValue();
  10511. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10512. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10513. MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
  10514. bool isAdvSIMDModImm = false;
  10515. uint64_t Shift;
  10516. if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
  10517. Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
  10518. Shift = 0;
  10519. }
  10520. else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
  10521. Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
  10522. Shift = 8;
  10523. }
  10524. if (isAdvSIMDModImm) {
  10525. SDLoc dl(Op);
  10526. SDValue Mov;
  10527. if (LHS)
  10528. Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
  10529. DAG.getConstant(Value, dl, MVT::i32),
  10530. DAG.getConstant(Shift, dl, MVT::i32));
  10531. else
  10532. Mov = DAG.getNode(NewOp, dl, MovTy,
  10533. DAG.getConstant(Value, dl, MVT::i32),
  10534. DAG.getConstant(Shift, dl, MVT::i32));
  10535. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10536. }
  10537. }
  10538. return SDValue();
  10539. }
  10540. // Try 32-bit splatted SIMD immediate with shifted ones.
  10541. static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
  10542. SelectionDAG &DAG, const APInt &Bits) {
  10543. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10544. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10545. EVT VT = Op.getValueType();
  10546. MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
  10547. bool isAdvSIMDModImm = false;
  10548. uint64_t Shift;
  10549. if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
  10550. Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
  10551. Shift = 264;
  10552. }
  10553. else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
  10554. Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
  10555. Shift = 272;
  10556. }
  10557. if (isAdvSIMDModImm) {
  10558. SDLoc dl(Op);
  10559. SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
  10560. DAG.getConstant(Value, dl, MVT::i32),
  10561. DAG.getConstant(Shift, dl, MVT::i32));
  10562. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10563. }
  10564. }
  10565. return SDValue();
  10566. }
  10567. // Try 8-bit splatted SIMD immediate.
  10568. static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
  10569. const APInt &Bits) {
  10570. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10571. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10572. EVT VT = Op.getValueType();
  10573. MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
  10574. if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
  10575. Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
  10576. SDLoc dl(Op);
  10577. SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
  10578. DAG.getConstant(Value, dl, MVT::i32));
  10579. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10580. }
  10581. }
  10582. return SDValue();
  10583. }
  10584. // Try FP splatted SIMD immediate.
  10585. static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
  10586. const APInt &Bits) {
  10587. if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
  10588. uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  10589. EVT VT = Op.getValueType();
  10590. bool isWide = (VT.getSizeInBits() == 128);
  10591. MVT MovTy;
  10592. bool isAdvSIMDModImm = false;
  10593. if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
  10594. Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
  10595. MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
  10596. }
  10597. else if (isWide &&
  10598. (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
  10599. Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
  10600. MovTy = MVT::v2f64;
  10601. }
  10602. if (isAdvSIMDModImm) {
  10603. SDLoc dl(Op);
  10604. SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
  10605. DAG.getConstant(Value, dl, MVT::i32));
  10606. return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
  10607. }
  10608. }
  10609. return SDValue();
  10610. }
10611. // Specialized code to quickly determine whether PotentialBVec is a
10612. // BUILD_VECTOR whose elements are all the same constant integer value,
10613. // which is returned in the reference argument ConstVal.
  10614. static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
  10615. uint64_t &ConstVal) {
  10616. BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  10617. if (!Bvec)
  10618. return false;
  10619. ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  10620. if (!FirstElt)
  10621. return false;
  10622. EVT VT = Bvec->getValueType(0);
  10623. unsigned NumElts = VT.getVectorNumElements();
  10624. for (unsigned i = 1; i < NumElts; ++i)
  10625. if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
  10626. return false;
  10627. ConstVal = FirstElt->getZExtValue();
  10628. return true;
  10629. }
  10630. // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
  10631. // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
  10632. // BUILD_VECTORs with constant element C1, C2 is a constant, and:
  10633. // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
  10634. // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
  10635. // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
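// For example, with 32-bit elements and C2 == 24, the SLI pattern requires
// C1 == 0x00ffffff, and (or (and X, 0x00ffffff), (shl Y, 24)) becomes
// (SLI X, Y, 24), i.e. the low 24 bits of X with Y shifted into the top 8.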
  10636. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  10637. EVT VT = N->getValueType(0);
  10638. if (!VT.isVector())
  10639. return SDValue();
  10640. SDLoc DL(N);
  10641. SDValue And;
  10642. SDValue Shift;
  10643. SDValue FirstOp = N->getOperand(0);
  10644. unsigned FirstOpc = FirstOp.getOpcode();
  10645. SDValue SecondOp = N->getOperand(1);
  10646. unsigned SecondOpc = SecondOp.getOpcode();
  10647. // Is one of the operands an AND or a BICi? The AND may have been optimised to
  10648. // a BICi in order to use an immediate instead of a register.
  10649. // Is the other operand an shl or lshr? This will have been turned into:
  10650. // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
  10651. if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
  10652. (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
  10653. And = FirstOp;
  10654. Shift = SecondOp;
  10655. } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
  10656. (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
  10657. And = SecondOp;
  10658. Shift = FirstOp;
  10659. } else
  10660. return SDValue();
  10661. bool IsAnd = And.getOpcode() == ISD::AND;
  10662. bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
  10663. // Is the shift amount constant?
  10664. ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  10665. if (!C2node)
  10666. return SDValue();
  10667. uint64_t C1;
  10668. if (IsAnd) {
  10669. // Is the and mask vector all constant?
  10670. if (!isAllConstantBuildVector(And.getOperand(1), C1))
  10671. return SDValue();
  10672. } else {
  10673. // Reconstruct the corresponding AND immediate from the two BICi immediates.
  10674. ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
  10675. ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
  10676. assert(C1nodeImm && C1nodeShift);
  10677. C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
  10678. }
  10679. // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
  10680. // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
  10681. // how much one can shift elements of a particular size?
  10682. uint64_t C2 = C2node->getZExtValue();
  10683. unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  10684. if (C2 > ElemSizeInBits)
  10685. return SDValue();
  10686. APInt C1AsAPInt(ElemSizeInBits, C1);
  10687. APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
  10688. : APInt::getLowBitsSet(ElemSizeInBits, C2);
  10689. if (C1AsAPInt != RequiredC1)
  10690. return SDValue();
  10691. SDValue X = And.getOperand(0);
  10692. SDValue Y = Shift.getOperand(0);
  10693. unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
  10694. SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
  10695. LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  10696. LLVM_DEBUG(N->dump(&DAG));
  10697. LLVM_DEBUG(dbgs() << "into: \n");
  10698. LLVM_DEBUG(ResultSLI->dump(&DAG));
  10699. ++NumShiftInserts;
  10700. return ResultSLI;
  10701. }
  10702. SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
  10703. SelectionDAG &DAG) const {
  10704. if (useSVEForFixedLengthVectorVT(Op.getValueType(),
  10705. Subtarget->forceStreamingCompatibleSVE()))
  10706. return LowerToScalableOp(Op, DAG);
  10707. // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
  10708. if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
  10709. return Res;
  10710. EVT VT = Op.getValueType();
  10711. SDValue LHS = Op.getOperand(0);
  10712. BuildVectorSDNode *BVN =
  10713. dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  10714. if (!BVN) {
  10715. // OR commutes, so try swapping the operands.
  10716. LHS = Op.getOperand(1);
  10717. BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
  10718. }
  10719. if (!BVN)
  10720. return Op;
  10721. APInt DefBits(VT.getSizeInBits(), 0);
  10722. APInt UndefBits(VT.getSizeInBits(), 0);
  10723. if (resolveBuildVector(BVN, DefBits, UndefBits)) {
  10724. SDValue NewOp;
  10725. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
  10726. DefBits, &LHS)) ||
  10727. (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
  10728. DefBits, &LHS)))
  10729. return NewOp;
  10730. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
  10731. UndefBits, &LHS)) ||
  10732. (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
  10733. UndefBits, &LHS)))
  10734. return NewOp;
  10735. }
  10736. // We can always fall back to a non-immediate OR.
  10737. return Op;
  10738. }
  10739. // Normalize the operands of BUILD_VECTOR. The value of constant operands will
  10740. // be truncated to fit element width.
  10741. static SDValue NormalizeBuildVector(SDValue Op,
  10742. SelectionDAG &DAG) {
  10743. assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  10744. SDLoc dl(Op);
  10745. EVT VT = Op.getValueType();
10746. EVT EltTy = VT.getVectorElementType();
  10747. if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
  10748. return Op;
  10749. SmallVector<SDValue, 16> Ops;
  10750. for (SDValue Lane : Op->ops()) {
  10751. // For integer vectors, type legalization would have promoted the
  10752. // operands already. Otherwise, if Op is a floating-point splat
  10753. // (with operands cast to integers), then the only possibilities
  10754. // are constants and UNDEFs.
  10755. if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
  10756. APInt LowBits(EltTy.getSizeInBits(),
  10757. CstLane->getZExtValue());
  10758. Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
  10759. } else if (Lane.getNode()->isUndef()) {
  10760. Lane = DAG.getUNDEF(MVT::i32);
  10761. } else {
  10762. assert(Lane.getValueType() == MVT::i32 &&
  10763. "Unexpected BUILD_VECTOR operand type");
  10764. }
  10765. Ops.push_back(Lane);
  10766. }
  10767. return DAG.getBuildVector(VT, dl, Ops);
  10768. }
  10769. static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
  10770. EVT VT = Op.getValueType();
  10771. APInt DefBits(VT.getSizeInBits(), 0);
  10772. APInt UndefBits(VT.getSizeInBits(), 0);
  10773. BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  10774. if (resolveBuildVector(BVN, DefBits, UndefBits)) {
  10775. SDValue NewOp;
  10776. if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
  10777. (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
  10778. (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
  10779. (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
  10780. (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
  10781. (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
  10782. return NewOp;
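// If DefBits itself has no legal encoding, check whether its bitwise inverse
// does; the MVNI forms materialize the inverted immediate.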
  10783. DefBits = ~DefBits;
  10784. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
  10785. (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
  10786. (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
  10787. return NewOp;
  10788. DefBits = UndefBits;
  10789. if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
  10790. (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
  10791. (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
  10792. (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
  10793. (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
  10794. (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
  10795. return NewOp;
  10796. DefBits = ~UndefBits;
  10797. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
  10798. (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
  10799. (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
  10800. return NewOp;
  10801. }
  10802. return SDValue();
  10803. }
  10804. SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
  10805. SelectionDAG &DAG) const {
  10806. EVT VT = Op.getValueType();
  10807. if (useSVEForFixedLengthVectorVT(VT,
  10808. Subtarget->forceStreamingCompatibleSVE())) {
  10809. if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
  10810. SDLoc DL(Op);
  10811. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  10812. SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
  10813. SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
  10814. SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
  10815. return convertFromScalableVector(DAG, Op.getValueType(), Seq);
  10816. }
  10817. // Revert to common legalisation for all other variants.
  10818. return SDValue();
  10819. }
  10820. // Try to build a simple constant vector.
  10821. Op = NormalizeBuildVector(Op, DAG);
10822. // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
10823. // abort.
  10824. if (Op.getOpcode() != ISD::BUILD_VECTOR)
  10825. return SDValue();
  10826. if (VT.isInteger()) {
  10827. // Certain vector constants, used to express things like logical NOT and
  10828. // arithmetic NEG, are passed through unmodified. This allows special
  10829. // patterns for these operations to match, which will lower these constants
  10830. // to whatever is proven necessary.
  10831. BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  10832. if (BVN->isConstant())
  10833. if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
  10834. unsigned BitSize = VT.getVectorElementType().getSizeInBits();
  10835. APInt Val(BitSize,
  10836. Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
  10837. if (Val.isZero() || Val.isAllOnes())
  10838. return Op;
  10839. }
  10840. }
  10841. if (SDValue V = ConstantBuildVector(Op, DAG))
  10842. return V;
  10843. // Scan through the operands to find some interesting properties we can
  10844. // exploit:
  10845. // 1) If only one value is used, we can use a DUP, or
  10846. // 2) if only the low element is not undef, we can just insert that, or
  10847. // 3) if only one constant value is used (w/ some non-constant lanes),
  10848. // we can splat the constant value into the whole vector then fill
  10849. // in the non-constant lanes.
  10850. // 4) FIXME: If different constant values are used, but we can intelligently
  10851. // select the values we'll be overwriting for the non-constant
  10852. // lanes such that we can directly materialize the vector
  10853. // some other way (MOVI, e.g.), we can be sneaky.
  10854. // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
  10855. SDLoc dl(Op);
  10856. unsigned NumElts = VT.getVectorNumElements();
  10857. bool isOnlyLowElement = true;
  10858. bool usesOnlyOneValue = true;
  10859. bool usesOnlyOneConstantValue = true;
  10860. bool isConstant = true;
  10861. bool AllLanesExtractElt = true;
  10862. unsigned NumConstantLanes = 0;
  10863. unsigned NumDifferentLanes = 0;
  10864. unsigned NumUndefLanes = 0;
  10865. SDValue Value;
  10866. SDValue ConstantValue;
  10867. for (unsigned i = 0; i < NumElts; ++i) {
  10868. SDValue V = Op.getOperand(i);
  10869. if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  10870. AllLanesExtractElt = false;
  10871. if (V.isUndef()) {
  10872. ++NumUndefLanes;
  10873. continue;
  10874. }
  10875. if (i > 0)
  10876. isOnlyLowElement = false;
  10877. if (!isIntOrFPConstant(V))
  10878. isConstant = false;
  10879. if (isIntOrFPConstant(V)) {
  10880. ++NumConstantLanes;
  10881. if (!ConstantValue.getNode())
  10882. ConstantValue = V;
  10883. else if (ConstantValue != V)
  10884. usesOnlyOneConstantValue = false;
  10885. }
  10886. if (!Value.getNode())
  10887. Value = V;
  10888. else if (V != Value) {
  10889. usesOnlyOneValue = false;
  10890. ++NumDifferentLanes;
  10891. }
  10892. }
  10893. if (!Value.getNode()) {
  10894. LLVM_DEBUG(
  10895. dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
  10896. return DAG.getUNDEF(VT);
  10897. }
  10898. // Convert BUILD_VECTOR where all elements but the lowest are undef into
  10899. // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
  10900. // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
  10901. if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
  10902. LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
  10903. "SCALAR_TO_VECTOR node\n");
  10904. return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
  10905. }
  10906. if (AllLanesExtractElt) {
  10907. SDNode *Vector = nullptr;
  10908. bool Even = false;
  10909. bool Odd = false;
  10910. // Check whether the extract elements match the Even pattern <0,2,4,...> or
  10911. // the Odd pattern <1,3,5,...>.
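// For example, a v4i16 BUILD_VECTOR whose operands are EXTRACT_VECTOR_ELT of
// one v8i16 source at lanes 0,2,4,6 matches the Even pattern and becomes a
// UZP1 of the source's two halves; lanes 1,3,5,7 match Odd and become UZP2.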
  10912. for (unsigned i = 0; i < NumElts; ++i) {
  10913. SDValue V = Op.getOperand(i);
  10914. const SDNode *N = V.getNode();
  10915. if (!isa<ConstantSDNode>(N->getOperand(1)))
  10916. break;
  10917. SDValue N0 = N->getOperand(0);
  10918. // All elements are extracted from the same vector.
  10919. if (!Vector) {
  10920. Vector = N0.getNode();
  10921. // Check that the type of EXTRACT_VECTOR_ELT matches the type of
  10922. // BUILD_VECTOR.
  10923. if (VT.getVectorElementType() !=
  10924. N0.getValueType().getVectorElementType())
  10925. break;
  10926. } else if (Vector != N0.getNode()) {
  10927. Odd = false;
  10928. Even = false;
  10929. break;
  10930. }
  10931. // Extracted values are either at Even indices <0,2,4,...> or at Odd
  10932. // indices <1,3,5,...>.
  10933. uint64_t Val = N->getConstantOperandVal(1);
  10934. if (Val == 2 * i) {
  10935. Even = true;
  10936. continue;
  10937. }
  10938. if (Val - 1 == 2 * i) {
  10939. Odd = true;
  10940. continue;
  10941. }
  10942. // Something does not match: abort.
  10943. Odd = false;
  10944. Even = false;
  10945. break;
  10946. }
  10947. if (Even || Odd) {
  10948. SDValue LHS =
  10949. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
  10950. DAG.getConstant(0, dl, MVT::i64));
  10951. SDValue RHS =
  10952. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
  10953. DAG.getConstant(NumElts, dl, MVT::i64));
  10954. if (Even && !Odd)
  10955. return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
  10956. RHS);
  10957. if (Odd && !Even)
  10958. return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
  10959. RHS);
  10960. }
  10961. }
  10962. // Use DUP for non-constant splats. For f32 constant splats, reduce to
  10963. // i32 and try again.
  10964. if (usesOnlyOneValue) {
  10965. if (!isConstant) {
  10966. if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  10967. Value.getValueType() != VT) {
  10968. LLVM_DEBUG(
  10969. dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
  10970. return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
  10971. }
10972. // This is actually a DUPLANExx operation, which keeps everything in vector registers.
  10973. SDValue Lane = Value.getOperand(1);
  10974. Value = Value.getOperand(0);
  10975. if (Value.getValueSizeInBits() == 64) {
  10976. LLVM_DEBUG(
  10977. dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
  10978. "widening it\n");
  10979. Value = WidenVector(Value, DAG);
  10980. }
  10981. unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
  10982. return DAG.getNode(Opcode, dl, VT, Value, Lane);
  10983. }
  10984. if (VT.getVectorElementType().isFloatingPoint()) {
  10985. SmallVector<SDValue, 8> Ops;
  10986. EVT EltTy = VT.getVectorElementType();
  10987. assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
  10988. EltTy == MVT::f64) && "Unsupported floating-point vector type");
  10989. LLVM_DEBUG(
  10990. dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
  10991. "BITCASTS, and try again\n");
  10992. MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
  10993. for (unsigned i = 0; i < NumElts; ++i)
  10994. Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
  10995. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
  10996. SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
  10997. LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
  10998. Val.dump(););
  10999. Val = LowerBUILD_VECTOR(Val, DAG);
  11000. if (Val.getNode())
  11001. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  11002. }
  11003. }
  11004. // If we need to insert a small number of different non-constant elements and
  11005. // the vector width is sufficiently large, prefer using DUP with the common
  11006. // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
  11007. // skip the constant lane handling below.
  11008. bool PreferDUPAndInsert =
  11009. !isConstant && NumDifferentLanes >= 1 &&
  11010. NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
  11011. NumDifferentLanes >= NumConstantLanes;
11012. // If there was only one constant value used, and it was used for more than one lane,
  11013. // start by splatting that value, then replace the non-constant lanes. This
  11014. // is better than the default, which will perform a separate initialization
  11015. // for each lane.
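// Illustrative sketch (assuming a v4i32 node): BUILD_VECTOR 7, 7, x, 7 is
// first materialized as a splat of 7, and then a single INSERT_VECTOR_ELT
// writes x into lane 2.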
  11016. if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
  11017. // Firstly, try to materialize the splat constant.
  11018. SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
  11019. Val = ConstantBuildVector(Vec, DAG);
  11020. if (!Val) {
  11021. // Otherwise, materialize the constant and splat it.
  11022. Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
  11023. DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
  11024. }
  11025. // Now insert the non-constant lanes.
  11026. for (unsigned i = 0; i < NumElts; ++i) {
  11027. SDValue V = Op.getOperand(i);
  11028. SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
  11029. if (!isIntOrFPConstant(V))
  11030. // Note that type legalization likely mucked about with the VT of the
  11031. // source operand, so we may have to convert it here before inserting.
  11032. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
  11033. }
  11034. return Val;
  11035. }
  11036. // This will generate a load from the constant pool.
  11037. if (isConstant) {
  11038. LLVM_DEBUG(
  11039. dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
  11040. "expansion\n");
  11041. return SDValue();
  11042. }
  11043. // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
  11044. // v4i32s. This is really a truncate, which we can construct out of (legal)
  11045. // concats and truncate nodes.
  11046. if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
  11047. return M;
  11048. // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  11049. if (NumElts >= 4) {
  11050. if (SDValue shuffle = ReconstructShuffle(Op, DAG))
  11051. return shuffle;
  11052. }
  11053. if (PreferDUPAndInsert) {
  11054. // First, build a constant vector with the common element.
  11055. SmallVector<SDValue, 8> Ops(NumElts, Value);
  11056. SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
  11057. // Next, insert the elements that do not match the common value.
  11058. for (unsigned I = 0; I < NumElts; ++I)
  11059. if (Op.getOperand(I) != Value)
  11060. NewVector =
  11061. DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
  11062. Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
  11063. return NewVector;
  11064. }
  11065. // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  11066. // know the default expansion would otherwise fall back on something even
11067. // worse. For a vector with one or two non-undef values, the default is a
11068. // scalar_to_vector for the elements followed by a shuffle (provided the
11069. // shuffle is valid for the target); for everything else it is
11070. // materialization element by element on the stack followed by a load.
  11071. if (!isConstant && !usesOnlyOneValue) {
  11072. LLVM_DEBUG(
  11073. dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
  11074. "of INSERT_VECTOR_ELT\n");
  11075. SDValue Vec = DAG.getUNDEF(VT);
  11076. SDValue Op0 = Op.getOperand(0);
  11077. unsigned i = 0;
  11078. // Use SCALAR_TO_VECTOR for lane zero to
  11079. // a) Avoid a RMW dependency on the full vector register, and
  11080. // b) Allow the register coalescer to fold away the copy if the
  11081. // value is already in an S or D register, and we're forced to emit an
  11082. // INSERT_SUBREG that we can't fold anywhere.
  11083. //
  11084. // We also allow types like i8 and i16 which are illegal scalar but legal
  11085. // vector element types. After type-legalization the inserted value is
  11086. // extended (i32) and it is safe to cast them to the vector type by ignoring
  11087. // the upper bits of the lowest lane (e.g. v8i8, v4i16).
  11088. if (!Op0.isUndef()) {
  11089. LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
  11090. Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
  11091. ++i;
  11092. }
  11093. LLVM_DEBUG(if (i < NumElts) dbgs()
  11094. << "Creating nodes for the other vector elements:\n";);
  11095. for (; i < NumElts; ++i) {
  11096. SDValue V = Op.getOperand(i);
  11097. if (V.isUndef())
  11098. continue;
  11099. SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
  11100. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
  11101. }
  11102. return Vec;
  11103. }
  11104. LLVM_DEBUG(
  11105. dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
  11106. "better alternative\n");
  11107. return SDValue();
  11108. }
  11109. SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
  11110. SelectionDAG &DAG) const {
  11111. if (useSVEForFixedLengthVectorVT(Op.getValueType(),
  11112. Subtarget->forceStreamingCompatibleSVE()))
  11113. return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
  11114. assert(Op.getValueType().isScalableVector() &&
  11115. isTypeLegal(Op.getValueType()) &&
  11116. "Expected legal scalable vector type!");
  11117. if (isTypeLegal(Op.getOperand(0).getValueType())) {
  11118. unsigned NumOperands = Op->getNumOperands();
  11119. assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
  11120. "Unexpected number of operands in CONCAT_VECTORS");
  11121. if (NumOperands == 2)
  11122. return Op;
  11123. // Concat each pair of subvectors and pack into the lower half of the array.
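// Sketch: concat(a, b, c, d) is reduced pairwise, first to concat(a, b) and
// concat(c, d), then to a single two-operand CONCAT_VECTORS of those results.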
  11124. SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
  11125. while (ConcatOps.size() > 1) {
  11126. for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
  11127. SDValue V1 = ConcatOps[I];
  11128. SDValue V2 = ConcatOps[I + 1];
  11129. EVT SubVT = V1.getValueType();
  11130. EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
  11131. ConcatOps[I / 2] =
  11132. DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
  11133. }
  11134. ConcatOps.resize(ConcatOps.size() / 2);
  11135. }
  11136. return ConcatOps[0];
  11137. }
  11138. return SDValue();
  11139. }
  11140. SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  11141. SelectionDAG &DAG) const {
  11142. assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
  11143. if (useSVEForFixedLengthVectorVT(Op.getValueType(),
  11144. Subtarget->forceStreamingCompatibleSVE()))
  11145. return LowerFixedLengthInsertVectorElt(Op, DAG);
  11146. // Check for non-constant or out of range lane.
  11147. EVT VT = Op.getOperand(0).getValueType();
  11148. if (VT.getScalarType() == MVT::i1) {
  11149. EVT VectorVT = getPromotedVTForPredicate(VT);
  11150. SDLoc DL(Op);
  11151. SDValue ExtendedVector =
  11152. DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
  11153. SDValue ExtendedValue =
  11154. DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
  11155. VectorVT.getScalarType().getSizeInBits() < 32
  11156. ? MVT::i32
  11157. : VectorVT.getScalarType());
  11158. ExtendedVector =
  11159. DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
  11160. ExtendedValue, Op.getOperand(2));
  11161. return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
  11162. }
  11163. ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  11164. if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
  11165. return SDValue();
  11166. // Insertion/extraction are legal for V128 types.
  11167. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
  11168. VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
  11169. VT == MVT::v8f16 || VT == MVT::v8bf16)
  11170. return Op;
  11171. if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
  11172. VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
  11173. VT != MVT::v4bf16)
  11174. return SDValue();
11175. // For V64 types, we perform the insertion by widening the value
11176. // to a V128 type and performing the insertion on that.
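// For example, inserting into a v4i16 widens the operand to v8i16, performs
// the v8i16 insertion below, and NarrowVector then takes the low 64 bits
// back as v4i16.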
  11177. SDLoc DL(Op);
  11178. SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  11179. EVT WideTy = WideVec.getValueType();
  11180. SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
  11181. Op.getOperand(1), Op.getOperand(2));
  11182. // Re-narrow the resultant vector.
  11183. return NarrowVector(Node, DAG);
  11184. }
  11185. SDValue
  11186. AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  11187. SelectionDAG &DAG) const {
  11188. assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  11189. EVT VT = Op.getOperand(0).getValueType();
  11190. if (VT.getScalarType() == MVT::i1) {
  11191. // We can't directly extract from an SVE predicate; extend it first.
  11192. // (This isn't the only possible lowering, but it's straightforward.)
  11193. EVT VectorVT = getPromotedVTForPredicate(VT);
  11194. SDLoc DL(Op);
  11195. SDValue Extend =
  11196. DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
  11197. MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
  11198. SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
  11199. Extend, Op.getOperand(1));
  11200. return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
  11201. }
  11202. if (useSVEForFixedLengthVectorVT(VT,
  11203. Subtarget->forceStreamingCompatibleSVE()))
  11204. return LowerFixedLengthExtractVectorElt(Op, DAG);
  11205. // Check for non-constant or out of range lane.
  11206. ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  11207. if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
  11208. return SDValue();
  11209. // Insertion/extraction are legal for V128 types.
  11210. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
  11211. VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
  11212. VT == MVT::v8f16 || VT == MVT::v8bf16)
  11213. return Op;
  11214. if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
  11215. VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
  11216. VT != MVT::v4bf16)
  11217. return SDValue();
11218. // For V64 types, we perform the extraction by widening the value
11219. // to a V128 type and performing the extraction on that.
  11220. SDLoc DL(Op);
  11221. SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  11222. EVT WideTy = WideVec.getValueType();
  11223. EVT ExtrTy = WideTy.getVectorElementType();
  11224. if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
  11225. ExtrTy = MVT::i32;
  11226. // For extractions, we just return the result directly.
  11227. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
  11228. Op.getOperand(1));
  11229. }
  11230. SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
  11231. SelectionDAG &DAG) const {
  11232. assert(Op.getValueType().isFixedLengthVector() &&
  11233. "Only cases that extract a fixed length vector are supported!");
  11234. EVT InVT = Op.getOperand(0).getValueType();
  11235. unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  11236. unsigned Size = Op.getValueSizeInBits();
  11237. // If we don't have legal types yet, do nothing
  11238. if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
  11239. return SDValue();
  11240. if (InVT.isScalableVector()) {
  11241. // This will be matched by custom code during ISelDAGToDAG.
  11242. if (Idx == 0 && isPackedVectorType(InVT, DAG))
  11243. return Op;
  11244. return SDValue();
  11245. }
  11246. // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
  11247. if (Idx == 0 && InVT.getSizeInBits() <= 128)
  11248. return Op;
11249. // If this is extracting the upper 64 bits of a 128-bit vector, we match
11250. // that directly.
  11251. if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
  11252. InVT.getSizeInBits() == 128 && !Subtarget->forceStreamingCompatibleSVE())
  11253. return Op;
  11254. if (useSVEForFixedLengthVectorVT(InVT,
  11255. Subtarget->forceStreamingCompatibleSVE())) {
  11256. SDLoc DL(Op);
  11257. EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  11258. SDValue NewInVec =
  11259. convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  11260. SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
  11261. NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
  11262. return convertFromScalableVector(DAG, Op.getValueType(), Splice);
  11263. }
  11264. return SDValue();
  11265. }
  11266. SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
  11267. SelectionDAG &DAG) const {
  11268. assert(Op.getValueType().isScalableVector() &&
  11269. "Only expect to lower inserts into scalable vectors!");
  11270. EVT InVT = Op.getOperand(1).getValueType();
  11271. unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  11272. SDValue Vec0 = Op.getOperand(0);
  11273. SDValue Vec1 = Op.getOperand(1);
  11274. SDLoc DL(Op);
  11275. EVT VT = Op.getValueType();
  11276. if (InVT.isScalableVector()) {
  11277. if (!isTypeLegal(VT))
  11278. return SDValue();
  11279. // Break down insert_subvector into simpler parts.
  11280. if (VT.getVectorElementType() == MVT::i1) {
  11281. unsigned NumElts = VT.getVectorMinNumElements();
  11282. EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  11283. SDValue Lo, Hi;
  11284. Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
  11285. DAG.getVectorIdxConstant(0, DL));
  11286. Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
  11287. DAG.getVectorIdxConstant(NumElts / 2, DL));
  11288. if (Idx < (NumElts / 2)) {
  11289. SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
  11290. DAG.getVectorIdxConstant(Idx, DL));
  11291. return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
  11292. } else {
  11293. SDValue NewHi =
  11294. DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
  11295. DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
  11296. return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
  11297. }
  11298. }
  11299. // Ensure the subvector is half the size of the main vector.
  11300. if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
  11301. return SDValue();
11302. // Here narrow and wide refer to the vector element types. After "casting",
11303. // both vectors must have the same bit length, so because the subvector
11304. // has fewer elements, those elements need to be bigger.
  11305. EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
  11306. EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
  11307. // NOP cast operands to the largest legal vector of the same element count.
  11308. if (VT.isFloatingPoint()) {
  11309. Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
  11310. Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
  11311. } else {
  11312. // Legal integer vectors are already their largest so Vec0 is fine as is.
  11313. Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
  11314. }
  11315. // To replace the top/bottom half of vector V with vector SubV we widen the
  11316. // preserved half of V, concatenate this to SubV (the order depending on the
  11317. // half being replaced) and then narrow the result.
  11318. SDValue Narrow;
  11319. if (Idx == 0) {
  11320. SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
  11321. Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
  11322. } else {
  11323. assert(Idx == InVT.getVectorMinNumElements() &&
  11324. "Invalid subvector index!");
  11325. SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
  11326. Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
  11327. }
  11328. return getSVESafeBitCast(VT, Narrow, DAG);
  11329. }
  11330. if (Idx == 0 && isPackedVectorType(VT, DAG)) {
  11331. // This will be matched by custom code during ISelDAGToDAG.
  11332. if (Vec0.isUndef())
  11333. return Op;
  11334. std::optional<unsigned> PredPattern =
  11335. getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
  11336. auto PredTy = VT.changeVectorElementType(MVT::i1);
  11337. SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
  11338. SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
  11339. return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
  11340. }
  11341. return SDValue();
  11342. }
  11343. static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
  11344. if (Op.getOpcode() != AArch64ISD::DUP &&
  11345. Op.getOpcode() != ISD::SPLAT_VECTOR &&
  11346. Op.getOpcode() != ISD::BUILD_VECTOR)
  11347. return false;
  11348. if (Op.getOpcode() == ISD::BUILD_VECTOR &&
  11349. !isAllConstantBuildVector(Op, SplatVal))
  11350. return false;
  11351. if (Op.getOpcode() != ISD::BUILD_VECTOR &&
  11352. !isa<ConstantSDNode>(Op->getOperand(0)))
  11353. return false;
  11354. SplatVal = Op->getConstantOperandVal(0);
  11355. if (Op.getValueType().getVectorElementType() != MVT::i64)
  11356. SplatVal = (int32_t)SplatVal;
  11357. Negated = false;
  11358. if (isPowerOf2_64(SplatVal))
  11359. return true;
  11360. Negated = true;
  11361. if (isPowerOf2_64(-SplatVal)) {
  11362. SplatVal = -SplatVal;
  11363. return true;
  11364. }
  11365. return false;
  11366. }
  11367. SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
  11368. EVT VT = Op.getValueType();
  11369. SDLoc dl(Op);
  11370. if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
  11371. return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
  11372. assert(VT.isScalableVector() && "Expected a scalable vector.");
  11373. bool Signed = Op.getOpcode() == ISD::SDIV;
  11374. unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
  11375. bool Negated;
  11376. uint64_t SplatVal;
  11377. if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
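// Worked example (illustrative): an sdiv by a splat of -8 reports
// SplatVal == 8 with Negated set, so we emit SRAD_MERGE_OP1 with shift
// Log2_64(8) == 3 and then subtract the result from zero.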
  11378. SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
  11379. SDValue Res =
  11380. DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
  11381. DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
  11382. if (Negated)
  11383. Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
  11384. return Res;
  11385. }
  11386. if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
  11387. return LowerToPredicatedOp(Op, DAG, PredOpcode);
  11388. // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
  11389. // operations, and truncate the result.
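// Sketch for nxv16i8: unpack the operands to nxv8i16 with S/UUNPKLO and
// S/UUNPKHI, divide each half (recursively widening again to nxv4i32), and
// pack the two results back together with UZP1.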
  11390. EVT WidenedVT;
  11391. if (VT == MVT::nxv16i8)
  11392. WidenedVT = MVT::nxv8i16;
  11393. else if (VT == MVT::nxv8i16)
  11394. WidenedVT = MVT::nxv4i32;
  11395. else
  11396. llvm_unreachable("Unexpected Custom DIV operation");
  11397. unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
  11398. unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
  11399. SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
  11400. SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
  11401. SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
  11402. SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
  11403. SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
  11404. SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
  11405. return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
  11406. }
  11407. bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  11408. // Currently no fixed length shuffles that require SVE are legal.
  11409. if (useSVEForFixedLengthVectorVT(VT,
  11410. Subtarget->forceStreamingCompatibleSVE()))
  11411. return false;
  11412. if (VT.getVectorNumElements() == 4 &&
  11413. (VT.is128BitVector() || VT.is64BitVector())) {
  11414. unsigned Cost = getPerfectShuffleCost(M);
  11415. if (Cost <= 1)
  11416. return true;
  11417. }
  11418. bool DummyBool;
  11419. int DummyInt;
  11420. unsigned DummyUnsigned;
  11421. return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
  11422. isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
  11423. isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
  11424. // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
  11425. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
  11426. isZIPMask(M, VT, DummyUnsigned) ||
  11427. isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
  11428. isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
  11429. isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
  11430. isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
  11431. isConcatMask(M, VT, VT.getSizeInBits() == 128));
  11432. }
  11433. bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
  11434. EVT VT) const {
  11435. // Just delegate to the generic legality, clear masks aren't special.
  11436. return isShuffleMaskLegal(M, VT);
  11437. }
  11438. /// getVShiftImm - Check if this is a valid build_vector for the immediate
  11439. /// operand of a vector shift operation, where all the elements of the
  11440. /// build_vector must have the same constant integer value.
  11441. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  11442. // Ignore bit_converts.
  11443. while (Op.getOpcode() == ISD::BITCAST)
  11444. Op = Op.getOperand(0);
  11445. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  11446. APInt SplatBits, SplatUndef;
  11447. unsigned SplatBitSize;
  11448. bool HasAnyUndefs;
  11449. if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
  11450. HasAnyUndefs, ElementBits) ||
  11451. SplatBitSize > ElementBits)
  11452. return false;
  11453. Cnt = SplatBits.getSExtValue();
  11454. return true;
  11455. }
  11456. /// isVShiftLImm - Check if this is a valid build_vector for the immediate
  11457. /// operand of a vector shift left operation. That value must be in the range:
  11458. /// 0 <= Value < ElementBits for a left shift; or
  11459. /// 0 <= Value <= ElementBits for a long left shift.
  11460. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  11461. assert(VT.isVector() && "vector shift count is not a vector type");
  11462. int64_t ElementBits = VT.getScalarSizeInBits();
  11463. if (!getVShiftImm(Op, ElementBits, Cnt))
  11464. return false;
  11465. return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
  11466. }
  11467. /// isVShiftRImm - Check if this is a valid build_vector for the immediate
  11468. /// operand of a vector shift right operation. The value must be in the range:
11469. /// 1 <= Value <= ElementBits for a right shift, or 1 <= Value <= ElementBits/2 for a narrowing right shift.
  11470. static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  11471. assert(VT.isVector() && "vector shift count is not a vector type");
  11472. int64_t ElementBits = VT.getScalarSizeInBits();
  11473. if (!getVShiftImm(Op, ElementBits, Cnt))
  11474. return false;
  11475. return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  11476. }
  11477. SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
  11478. SelectionDAG &DAG) const {
  11479. EVT VT = Op.getValueType();
  11480. if (VT.getScalarType() == MVT::i1) {
  11481. // Lower i1 truncate to `(x & 1) != 0`.
  11482. SDLoc dl(Op);
  11483. EVT OpVT = Op.getOperand(0).getValueType();
  11484. SDValue Zero = DAG.getConstant(0, dl, OpVT);
  11485. SDValue One = DAG.getConstant(1, dl, OpVT);
  11486. SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
  11487. return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
  11488. }
  11489. if (!VT.isVector() || VT.isScalableVector())
  11490. return SDValue();
  11491. if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
  11492. Subtarget->forceStreamingCompatibleSVE()))
  11493. return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
  11494. return SDValue();
  11495. }
  11496. SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
  11497. SelectionDAG &DAG) const {
  11498. EVT VT = Op.getValueType();
  11499. SDLoc DL(Op);
  11500. int64_t Cnt;
  11501. if (!Op.getOperand(1).getValueType().isVector())
  11502. return Op;
  11503. unsigned EltSize = VT.getScalarSizeInBits();
  11504. switch (Op.getOpcode()) {
  11505. case ISD::SHL:
  11506. if (VT.isScalableVector() ||
  11507. useSVEForFixedLengthVectorVT(VT,
  11508. Subtarget->forceStreamingCompatibleSVE()))
  11509. return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
  11510. if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
  11511. return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
  11512. DAG.getConstant(Cnt, DL, MVT::i32));
  11513. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
  11514. DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
  11515. MVT::i32),
  11516. Op.getOperand(0), Op.getOperand(1));
  11517. case ISD::SRA:
  11518. case ISD::SRL:
  11519. if (VT.isScalableVector() ||
  11520. useSVEForFixedLengthVectorVT(
  11521. VT, Subtarget->forceStreamingCompatibleSVE())) {
  11522. unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
  11523. : AArch64ISD::SRL_PRED;
  11524. return LowerToPredicatedOp(Op, DAG, Opc);
  11525. }
  11526. // Right shift immediate
  11527. if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
  11528. unsigned Opc =
  11529. (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
  11530. return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
  11531. DAG.getConstant(Cnt, DL, MVT::i32));
  11532. }
11533. // Right shift by register. Note: there is no shift-right-by-register
11534. // instruction, but the shift-left-by-register instruction takes a signed
11535. // value, where negative amounts specify a right shift.
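// E.g. a v4i32 SRL by a non-constant amount N is emitted below as
// aarch64_neon_ushl(x, 0 - N); SRA uses aarch64_neon_sshl instead.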
  11536. unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
  11537. : Intrinsic::aarch64_neon_ushl;
11538. // Negate the shift amount.
  11539. SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
  11540. Op.getOperand(1));
  11541. SDValue NegShiftLeft =
  11542. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
  11543. DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
  11544. NegShift);
  11545. return NegShiftLeft;
  11546. }
  11547. llvm_unreachable("unexpected shift opcode");
  11548. }
  11549. static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
  11550. AArch64CC::CondCode CC, bool NoNans, EVT VT,
  11551. const SDLoc &dl, SelectionDAG &DAG) {
  11552. EVT SrcVT = LHS.getValueType();
  11553. assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
  11554. "function only supposed to emit natural comparisons");
  11555. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  11556. APInt CnstBits(VT.getSizeInBits(), 0);
  11557. APInt UndefBits(VT.getSizeInBits(), 0);
  11558. bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  11559. bool IsZero = IsCnst && (CnstBits == 0);
  11560. if (SrcVT.getVectorElementType().isFloatingPoint()) {
  11561. switch (CC) {
  11562. default:
  11563. return SDValue();
  11564. case AArch64CC::NE: {
  11565. SDValue Fcmeq;
  11566. if (IsZero)
  11567. Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
  11568. else
  11569. Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
  11570. return DAG.getNOT(dl, Fcmeq, VT);
  11571. }
  11572. case AArch64CC::EQ:
  11573. if (IsZero)
  11574. return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
  11575. return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
  11576. case AArch64CC::GE:
  11577. if (IsZero)
  11578. return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
  11579. return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
  11580. case AArch64CC::GT:
  11581. if (IsZero)
  11582. return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
  11583. return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
  11584. case AArch64CC::LE:
  11585. if (!NoNans)
  11586. return SDValue();
11587. // If we ignore NaNs then we can use the LS implementation.
  11588. [[fallthrough]];
  11589. case AArch64CC::LS:
  11590. if (IsZero)
  11591. return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
  11592. return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
  11593. case AArch64CC::LT:
  11594. if (!NoNans)
  11595. return SDValue();
11596. // If we ignore NaNs then we can use the MI implementation.
  11597. [[fallthrough]];
  11598. case AArch64CC::MI:
  11599. if (IsZero)
  11600. return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
  11601. return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
  11602. }
  11603. }
  11604. switch (CC) {
  11605. default:
  11606. return SDValue();
  11607. case AArch64CC::NE: {
  11608. SDValue Cmeq;
  11609. if (IsZero)
  11610. Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
  11611. else
  11612. Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  11613. return DAG.getNOT(dl, Cmeq, VT);
  11614. }
  11615. case AArch64CC::EQ:
  11616. if (IsZero)
  11617. return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
  11618. return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  11619. case AArch64CC::GE:
  11620. if (IsZero)
  11621. return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
  11622. return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
  11623. case AArch64CC::GT:
  11624. if (IsZero)
  11625. return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
  11626. return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
  11627. case AArch64CC::LE:
  11628. if (IsZero)
  11629. return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
  11630. return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  11631. case AArch64CC::LS:
  11632. return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
  11633. case AArch64CC::LO:
  11634. return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
  11635. case AArch64CC::LT:
  11636. if (IsZero)
  11637. return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
  11638. return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
  11639. case AArch64CC::HI:
  11640. return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
  11641. case AArch64CC::HS:
  11642. return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
  11643. }
  11644. }
  11645. SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
  11646. SelectionDAG &DAG) const {
  11647. if (Op.getValueType().isScalableVector())
  11648. return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
  11649. if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
  11650. Subtarget->forceStreamingCompatibleSVE()))
  11651. return LowerFixedLengthVectorSetccToSVE(Op, DAG);
  11652. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  11653. SDValue LHS = Op.getOperand(0);
  11654. SDValue RHS = Op.getOperand(1);
  11655. EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  11656. SDLoc dl(Op);
  11657. if (LHS.getValueType().getVectorElementType().isInteger()) {
  11658. assert(LHS.getValueType() == RHS.getValueType());
  11659. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
  11660. SDValue Cmp =
  11661. EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
  11662. return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  11663. }
  11664. const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
11665. // Make v4f16 (only) fcmp operations utilise vector instructions;
11666. // v8f16 support will be a little more complicated.
  11667. if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
  11668. if (LHS.getValueType().getVectorNumElements() == 4) {
  11669. LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
  11670. RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
  11671. SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
  11672. DAG.ReplaceAllUsesWith(Op, NewSetcc);
  11673. CmpVT = MVT::v4i32;
  11674. } else
  11675. return SDValue();
  11676. }
  11677. assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
  11678. LHS.getValueType().getVectorElementType() != MVT::f128);
  11679. // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  11680. // clean. Some of them require two branches to implement.
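// For example, an ordered "not equal" has no single AArch64 condition and is
// emitted below as two comparisons, roughly (GT lhs, rhs) OR (GT rhs, lhs).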
  11681. AArch64CC::CondCode CC1, CC2;
  11682. bool ShouldInvert;
  11683. changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
  11684. bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
  11685. SDValue Cmp =
  11686. EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
  11687. if (!Cmp.getNode())
  11688. return SDValue();
  11689. if (CC2 != AArch64CC::AL) {
  11690. SDValue Cmp2 =
  11691. EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
  11692. if (!Cmp2.getNode())
  11693. return SDValue();
  11694. Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
  11695. }
  11696. Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  11697. if (ShouldInvert)
  11698. Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
  11699. return Cmp;
  11700. }
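// Helper for the NEON reductions below. As a sketch, VECREDUCE_ADD of a
// v4i32 becomes AArch64ISD::UADDV on the v4i32 (the result lands in lane 0)
// followed by an EXTRACT_VECTOR_ELT of lane 0.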
  11701. static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
  11702. SelectionDAG &DAG) {
  11703. SDValue VecOp = ScalarOp.getOperand(0);
  11704. auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
  11705. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
  11706. DAG.getConstant(0, DL, MVT::i64));
  11707. }
  11708. SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
  11709. SelectionDAG &DAG) const {
  11710. SDValue Src = Op.getOperand(0);
  11711. // Try to lower fixed length reductions to SVE.
  11712. EVT SrcVT = Src.getValueType();
  11713. bool OverrideNEON = Subtarget->forceStreamingCompatibleSVE() ||
  11714. Op.getOpcode() == ISD::VECREDUCE_AND ||
  11715. Op.getOpcode() == ISD::VECREDUCE_OR ||
  11716. Op.getOpcode() == ISD::VECREDUCE_XOR ||
  11717. Op.getOpcode() == ISD::VECREDUCE_FADD ||
  11718. (Op.getOpcode() != ISD::VECREDUCE_ADD &&
  11719. SrcVT.getVectorElementType() == MVT::i64);
  11720. if (SrcVT.isScalableVector() ||
  11721. useSVEForFixedLengthVectorVT(
  11722. SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
  11723. if (SrcVT.getVectorElementType() == MVT::i1)
  11724. return LowerPredReductionToSVE(Op, DAG);
  11725. switch (Op.getOpcode()) {
  11726. case ISD::VECREDUCE_ADD:
  11727. return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
  11728. case ISD::VECREDUCE_AND:
  11729. return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
  11730. case ISD::VECREDUCE_OR:
  11731. return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
  11732. case ISD::VECREDUCE_SMAX:
  11733. return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
  11734. case ISD::VECREDUCE_SMIN:
  11735. return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
  11736. case ISD::VECREDUCE_UMAX:
  11737. return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
  11738. case ISD::VECREDUCE_UMIN:
  11739. return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
  11740. case ISD::VECREDUCE_XOR:
  11741. return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
  11742. case ISD::VECREDUCE_FADD:
  11743. return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
  11744. case ISD::VECREDUCE_FMAX:
  11745. return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
  11746. case ISD::VECREDUCE_FMIN:
  11747. return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
  11748. default:
  11749. llvm_unreachable("Unhandled fixed length reduction");
  11750. }
  11751. }
  11752. // Lower NEON reductions.
  11753. SDLoc dl(Op);
  11754. switch (Op.getOpcode()) {
  11755. case ISD::VECREDUCE_ADD:
  11756. return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
  11757. case ISD::VECREDUCE_SMAX:
  11758. return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
  11759. case ISD::VECREDUCE_SMIN:
  11760. return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
  11761. case ISD::VECREDUCE_UMAX:
  11762. return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
  11763. case ISD::VECREDUCE_UMIN:
  11764. return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
  11765. case ISD::VECREDUCE_FMAX: {
  11766. return DAG.getNode(
  11767. ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
  11768. DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
  11769. Src);
  11770. }
  11771. case ISD::VECREDUCE_FMIN: {
  11772. return DAG.getNode(
  11773. ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
  11774. DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
  11775. Src);
  11776. }
  11777. default:
  11778. llvm_unreachable("Unhandled reduction");
  11779. }
  11780. }
  11781. SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
  11782. SelectionDAG &DAG) const {
  11783. auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  11784. if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
  11785. return SDValue();
  11786. // LSE has an atomic load-add instruction, but not a load-sub.
  11787. SDLoc dl(Op);
  11788. MVT VT = Op.getSimpleValueType();
  11789. SDValue RHS = Op.getOperand(2);
  11790. AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
  11791. RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
  11792. return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
  11793. Op.getOperand(0), Op.getOperand(1), RHS,
  11794. AN->getMemOperand());
  11795. }
  11796. SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
  11797. SelectionDAG &DAG) const {
  11798. auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  11799. if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
  11800. return SDValue();
  11801. // LSE has an atomic load-clear instruction, but not a load-and.
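// E.g. an atomicrmw 'and' with mask M is rewritten as ATOMIC_LOAD_CLR with
// ~M, since the clear operation computes old & ~operand.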
  11802. SDLoc dl(Op);
  11803. MVT VT = Op.getSimpleValueType();
  11804. SDValue RHS = Op.getOperand(2);
  11805. AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
  11806. RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
  11807. return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
  11808. Op.getOperand(0), Op.getOperand(1), RHS,
  11809. AN->getMemOperand());
  11810. }
  11811. SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
  11812. SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
  11813. SDLoc dl(Op);
  11814. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  11815. SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
  11816. PtrVT, 0);
  11817. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  11818. const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
  11819. if (Subtarget->hasCustomCallingConv())
  11820. TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
  11821. Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
  11822. DAG.getConstant(4, dl, MVT::i64));
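// The probe helper expects the allocation size in X15 in units of 16 bytes
// (hence the SRL by 4 here and the matching SHL by 4 after the call); e.g. a
// 4096-byte allocation is passed as X15 == 256. The precise probe-helper
// contract is assumed here.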
  11823. Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
  11824. Chain =
  11825. DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
  11826. Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
  11827. DAG.getRegisterMask(Mask), Chain.getValue(1));
  11828. // To match the actual intent better, we should read the output from X15 here
  11829. // again (instead of potentially spilling it to the stack), but rereading Size
  11830. // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
  11831. // here.
  11832. Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
  11833. DAG.getConstant(4, dl, MVT::i64));
  11834. return Chain;
  11835. }
  11836. SDValue
  11837. AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  11838. SelectionDAG &DAG) const {
  11839. assert(Subtarget->isTargetWindows() &&
  11840. "Only Windows alloca probing supported");
  11841. SDLoc dl(Op);
  11842. // Get the inputs.
  11843. SDNode *Node = Op.getNode();
  11844. SDValue Chain = Op.getOperand(0);
  11845. SDValue Size = Op.getOperand(1);
  11846. MaybeAlign Align =
  11847. cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
  11848. EVT VT = Node->getValueType(0);
  11849. if (DAG.getMachineFunction().getFunction().hasFnAttribute(
  11850. "no-stack-arg-probe")) {
  11851. SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
  11852. Chain = SP.getValue(1);
  11853. SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
  11854. if (Align)
  11855. SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
  11856. DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
  11857. Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
  11858. SDValue Ops[2] = {SP, Chain};
  11859. return DAG.getMergeValues(Ops, dl);
  11860. }
  11861. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
  11862. Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
  11863. SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
  11864. Chain = SP.getValue(1);
  11865. SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
  11866. if (Align)
  11867. SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
  11868. DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
  11869. Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
  11870. Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
  11871. SDValue Ops[2] = {SP, Chain};
  11872. return DAG.getMergeValues(Ops, dl);
  11873. }
  11874. SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
  11875. SelectionDAG &DAG) const {
  11876. EVT VT = Op.getValueType();
  11877. assert(VT != MVT::i64 && "Expected illegal VSCALE node");
  11878. SDLoc DL(Op);
  11879. APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
  11880. return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
  11881. VT);
  11882. }
  11883. /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
  11884. template <unsigned NumVecs>
  11885. static bool
  11886. setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
  11887. AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
  11888. Info.opc = ISD::INTRINSIC_VOID;
  11889. // Retrieve EC from first vector argument.
  11890. const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
  11891. ElementCount EC = VT.getVectorElementCount();
  11892. #ifndef NDEBUG
  11893. // Check the assumption that all input vectors are the same type.
  11894. for (unsigned I = 0; I < NumVecs; ++I)
  11895. assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
  11896. "Invalid type.");
  11897. #endif
  11898. // memVT is `NumVecs * VT`.
  11899. Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
  11900. EC * NumVecs);
  11901. Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
  11902. Info.offset = 0;
  11903. Info.align.reset();
  11904. Info.flags = MachineMemOperand::MOStore;
  11905. return true;
  11906. }
  11907. /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
  11908. /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
  11909. /// specified in the intrinsic calls.
  11910. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
  11911. const CallInst &I,
  11912. MachineFunction &MF,
  11913. unsigned Intrinsic) const {
  11914. auto &DL = I.getModule()->getDataLayout();
  11915. switch (Intrinsic) {
  11916. case Intrinsic::aarch64_sve_st2:
  11917. return setInfoSVEStN<2>(*this, DL, Info, I);
  11918. case Intrinsic::aarch64_sve_st3:
  11919. return setInfoSVEStN<3>(*this, DL, Info, I);
  11920. case Intrinsic::aarch64_sve_st4:
  11921. return setInfoSVEStN<4>(*this, DL, Info, I);
  11922. case Intrinsic::aarch64_neon_ld2:
  11923. case Intrinsic::aarch64_neon_ld3:
  11924. case Intrinsic::aarch64_neon_ld4:
  11925. case Intrinsic::aarch64_neon_ld1x2:
  11926. case Intrinsic::aarch64_neon_ld1x3:
  11927. case Intrinsic::aarch64_neon_ld1x4:
  11928. case Intrinsic::aarch64_neon_ld2lane:
  11929. case Intrinsic::aarch64_neon_ld3lane:
  11930. case Intrinsic::aarch64_neon_ld4lane:
  11931. case Intrinsic::aarch64_neon_ld2r:
  11932. case Intrinsic::aarch64_neon_ld3r:
  11933. case Intrinsic::aarch64_neon_ld4r: {
  11934. Info.opc = ISD::INTRINSIC_W_CHAIN;
  11935. // Conservatively set memVT to the entire set of vectors loaded.
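// E.g. aarch64_neon_ld3 returning { <4 x i32>, <4 x i32>, <4 x i32> } is
// 384 bits, so memVT becomes v6i64.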
  11936. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
  11937. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  11938. Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
  11939. Info.offset = 0;
  11940. Info.align.reset();
  11941. // volatile loads with NEON intrinsics not supported
  11942. Info.flags = MachineMemOperand::MOLoad;
  11943. return true;
  11944. }
  11945. case Intrinsic::aarch64_neon_st2:
  11946. case Intrinsic::aarch64_neon_st3:
  11947. case Intrinsic::aarch64_neon_st4:
  11948. case Intrinsic::aarch64_neon_st1x2:
  11949. case Intrinsic::aarch64_neon_st1x3:
  11950. case Intrinsic::aarch64_neon_st1x4:
  11951. case Intrinsic::aarch64_neon_st2lane:
  11952. case Intrinsic::aarch64_neon_st3lane:
  11953. case Intrinsic::aarch64_neon_st4lane: {
  11954. Info.opc = ISD::INTRINSIC_VOID;
  11955. // Conservatively set memVT to the entire set of vectors stored.
  11956. unsigned NumElts = 0;
  11957. for (const Value *Arg : I.args()) {
  11958. Type *ArgTy = Arg->getType();
  11959. if (!ArgTy->isVectorTy())
  11960. break;
  11961. NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
  11962. }
  11963. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  11964. Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
  11965. Info.offset = 0;
  11966. Info.align.reset();
  11967. // volatile stores with NEON intrinsics not supported
  11968. Info.flags = MachineMemOperand::MOStore;
  11969. return true;
  11970. }
  11971. case Intrinsic::aarch64_ldaxr:
  11972. case Intrinsic::aarch64_ldxr: {
  11973. Type *ValTy = I.getParamElementType(0);
  11974. Info.opc = ISD::INTRINSIC_W_CHAIN;
  11975. Info.memVT = MVT::getVT(ValTy);
  11976. Info.ptrVal = I.getArgOperand(0);
  11977. Info.offset = 0;
  11978. Info.align = DL.getABITypeAlign(ValTy);
  11979. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
  11980. return true;
  11981. }
  11982. case Intrinsic::aarch64_stlxr:
  11983. case Intrinsic::aarch64_stxr: {
  11984. Type *ValTy = I.getParamElementType(1);
  11985. Info.opc = ISD::INTRINSIC_W_CHAIN;
  11986. Info.memVT = MVT::getVT(ValTy);
  11987. Info.ptrVal = I.getArgOperand(1);
  11988. Info.offset = 0;
  11989. Info.align = DL.getABITypeAlign(ValTy);
  11990. Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
  11991. return true;
  11992. }
  11993. case Intrinsic::aarch64_ldaxp:
  11994. case Intrinsic::aarch64_ldxp:
  11995. Info.opc = ISD::INTRINSIC_W_CHAIN;
  11996. Info.memVT = MVT::i128;
  11997. Info.ptrVal = I.getArgOperand(0);
  11998. Info.offset = 0;
  11999. Info.align = Align(16);
  12000. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
  12001. return true;
  12002. case Intrinsic::aarch64_stlxp:
  12003. case Intrinsic::aarch64_stxp:
  12004. Info.opc = ISD::INTRINSIC_W_CHAIN;
  12005. Info.memVT = MVT::i128;
  12006. Info.ptrVal = I.getArgOperand(2);
  12007. Info.offset = 0;
  12008. Info.align = Align(16);
  12009. Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
  12010. return true;
  12011. case Intrinsic::aarch64_sve_ldnt1: {
  12012. Type *ElTy = cast<VectorType>(I.getType())->getElementType();
  12013. Info.opc = ISD::INTRINSIC_W_CHAIN;
  12014. Info.memVT = MVT::getVT(I.getType());
  12015. Info.ptrVal = I.getArgOperand(1);
  12016. Info.offset = 0;
  12017. Info.align = DL.getABITypeAlign(ElTy);
  12018. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
  12019. return true;
  12020. }
  12021. case Intrinsic::aarch64_sve_stnt1: {
  12022. Type *ElTy =
  12023. cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
  12024. Info.opc = ISD::INTRINSIC_W_CHAIN;
  12025. Info.memVT = MVT::getVT(I.getOperand(0)->getType());
  12026. Info.ptrVal = I.getArgOperand(2);
  12027. Info.offset = 0;
  12028. Info.align = DL.getABITypeAlign(ElTy);
  12029. Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
  12030. return true;
  12031. }
  12032. case Intrinsic::aarch64_mops_memset_tag: {
  12033. Value *Dst = I.getArgOperand(0);
  12034. Value *Val = I.getArgOperand(1);
  12035. Info.opc = ISD::INTRINSIC_W_CHAIN;
  12036. Info.memVT = MVT::getVT(Val->getType());
  12037. Info.ptrVal = Dst;
  12038. Info.offset = 0;
  12039. Info.align = I.getParamAlign(0).valueOrOne();
  12040. Info.flags = MachineMemOperand::MOStore;
  12041. // The size of the memory being operated on is unknown at this point
  12042. Info.size = MemoryLocation::UnknownSize;
  12043. return true;
  12044. }
  12045. default:
  12046. break;
  12047. }
  12048. return false;
  12049. }
  12050. bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
  12051. ISD::LoadExtType ExtTy,
  12052. EVT NewVT) const {
  12053. // TODO: This may be worth removing. Check regression tests for diffs.
  12054. if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
  12055. return false;
  12056. // If we're reducing the load width in order to avoid having to use an extra
  12057. // instruction to do extension then it's probably a good idea.
  12058. if (ExtTy != ISD::NON_EXTLOAD)
  12059. return true;
  12060. // Don't reduce load width if it would prevent us from combining a shift into
  12061. // the offset.
  12062. MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
  12063. assert(Mem);
  12064. const SDValue &Base = Mem->getBasePtr();
  12065. if (Base.getOpcode() == ISD::ADD &&
  12066. Base.getOperand(1).getOpcode() == ISD::SHL &&
  12067. Base.getOperand(1).hasOneUse() &&
  12068. Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
  12069. // It's unknown whether a scalable vector has a power-of-2 bitwidth.
  12070. if (Mem->getMemoryVT().isScalableVector())
  12071. return false;
  12072. // The shift can be combined if it matches the size of the value being
  12073. // loaded (and so reducing the width would make it not match).
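// E.g. for an i64 load whose address is add(base, shl(index, 3)),
// LoadBytes == 8 and ShiftAmount == 3 == Log2_32(8), so we keep the full
// load width and let the shift fold into the addressing mode.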
  12074. uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
  12075. uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
  12076. if (ShiftAmount == Log2_32(LoadBytes))
  12077. return false;
  12078. }
  12079. // We have no reason to disallow reducing the load width, so allow it.
  12080. return true;
  12081. }
  12082. // Truncations from 64-bit GPR to 32-bit GPR is free.
  12083. bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  12084. if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
  12085. return false;
  12086. uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
  12087. uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
  12088. return NumBits1 > NumBits2;
  12089. }
  12090. bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  12091. if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
  12092. return false;
  12093. uint64_t NumBits1 = VT1.getFixedSizeInBits();
  12094. uint64_t NumBits2 = VT2.getFixedSizeInBits();
  12095. return NumBits1 > NumBits2;
  12096. }
12097. /// Check if it is profitable to hoist an instruction in then/else to if.
12098. /// It is not profitable if I and its user can form an FMA instruction,
12099. /// because we prefer FMSUB/FMADD.
  12100. bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  12101. if (I->getOpcode() != Instruction::FMul)
  12102. return true;
  12103. if (!I->hasOneUse())
  12104. return true;
  12105. Instruction *User = I->user_back();
  12106. if (!(User->getOpcode() == Instruction::FSub ||
  12107. User->getOpcode() == Instruction::FAdd))
  12108. return true;
  12109. const TargetOptions &Options = getTargetMachine().Options;
  12110. const Function *F = I->getFunction();
  12111. const DataLayout &DL = F->getParent()->getDataLayout();
  12112. Type *Ty = User->getOperand(0)->getType();
  12113. return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
  12114. isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
  12115. (Options.AllowFPOpFusion == FPOpFusion::Fast ||
  12116. Options.UnsafeFPMath));
  12117. }
  12118. // All 32-bit GPR operations implicitly zero the high-half of the corresponding
  12119. // 64-bit GPR.
  12120. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  12121. if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
  12122. return false;
  12123. unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  12124. unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  12125. return NumBits1 == 32 && NumBits2 == 64;
  12126. }
  12127. bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  12128. if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
  12129. return false;
  12130. unsigned NumBits1 = VT1.getSizeInBits();
  12131. unsigned NumBits2 = VT2.getSizeInBits();
  12132. return NumBits1 == 32 && NumBits2 == 64;
  12133. }
  12134. bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  12135. EVT VT1 = Val.getValueType();
  12136. if (isZExtFree(VT1, VT2)) {
  12137. return true;
  12138. }
  12139. if (Val.getOpcode() != ISD::LOAD)
  12140. return false;
  12141. // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
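  // For example (illustrative), these loads already zero the remaining bits
  // of the 64-bit register, so no explicit extension is needed:
  //   ldrb w0, [x0]   ; zeroes bits [63:8]
  //   ldr  w0, [x0]   ; zeroes bits [63:32]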
  12142. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
  12143. VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
  12144. VT1.getSizeInBits() <= 32);
  12145. }
  12146. bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
  12147. if (isa<FPExtInst>(Ext))
  12148. return false;
  12149. // Vector types are not free.
  12150. if (Ext->getType()->isVectorTy())
  12151. return false;
  12152. for (const Use &U : Ext->uses()) {
  12153. // The extension is free if we can fold it with a left shift in an
  12154. // addressing mode or an arithmetic operation: add, sub, and cmp.
  12155. // Is there a shift?
  12156. const Instruction *Instr = cast<Instruction>(U.getUser());
  12157. // Is this a constant shift?
  12158. switch (Instr->getOpcode()) {
  12159. case Instruction::Shl:
  12160. if (!isa<ConstantInt>(Instr->getOperand(1)))
  12161. return false;
  12162. break;
  12163. case Instruction::GetElementPtr: {
  12164. gep_type_iterator GTI = gep_type_begin(Instr);
  12165. auto &DL = Ext->getModule()->getDataLayout();
  12166. std::advance(GTI, U.getOperandNo()-1);
  12167. Type *IdxTy = GTI.getIndexedType();
  12168. // This extension will end up with a shift because of the scaling factor.
  12169. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
  12170. // Get the shift amount based on the scaling factor:
  12171. // log2(sizeof(IdxTy)) - log2(8).
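      // For example (illustrative), a sign-extended i32 index used in a GEP
      // over i64 elements (ShiftAmt == 3) folds into the addressing mode:
      //   ldr x0, [x0, w1, sxtw #3]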
  12172. uint64_t ShiftAmt =
  12173. countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
  12174. 3;
  12175. // Is the constant foldable in the shift of the addressing mode?
  12176. // I.e., shift amount is between 1 and 4 inclusive.
  12177. if (ShiftAmt == 0 || ShiftAmt > 4)
  12178. return false;
  12179. break;
  12180. }
  12181. case Instruction::Trunc:
  12182. // Check if this is a noop.
  12183. // trunc(sext ty1 to ty2) to ty1.
  12184. if (Instr->getType() == Ext->getOperand(0)->getType())
  12185. continue;
  12186. [[fallthrough]];
  12187. default:
  12188. return false;
  12189. }
  12190. // At this point we can use the bfm family, so this extension is free
  12191. // for that use.
  12192. }
  12193. return true;
  12194. }
  12195. static bool isSplatShuffle(Value *V) {
  12196. if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
  12197. return all_equal(Shuf->getShuffleMask());
  12198. return false;
  12199. }
  12200. /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
  12201. /// or upper half of the vector elements.
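/// E.g. (illustrative), for an <8 x i16> input %a:
///   %lo = shuffle %a, undef, <0, 1, 2, 3> ; lower half
///   %hi = shuffle %a, undef, <4, 5, 6, 7> ; upper half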
  12202. static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
  12203. bool AllowSplat = false) {
  12204. auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
  12205. auto *FullTy = FullV->getType();
  12206. auto *HalfTy = HalfV->getType();
  12207. return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
  12208. 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  12209. };
  12210. auto extractHalf = [](Value *FullV, Value *HalfV) {
  12211. auto *FullVT = cast<FixedVectorType>(FullV->getType());
  12212. auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
  12213. return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  12214. };
  12215. ArrayRef<int> M1, M2;
  12216. Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  12217. if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
  12218. !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
  12219. return false;
  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so
  // that it is not checked as an extract below.
  12222. if (AllowSplat && isSplatShuffle(Op1))
  12223. S1Op1 = nullptr;
  12224. if (AllowSplat && isSplatShuffle(Op2))
  12225. S2Op1 = nullptr;
  // Check that the operands are half as wide as the result type and that we
  // extract half of the elements of the input vectors.
  12228. if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
  12229. (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
  12230. return false;
  12231. // Check the mask extracts either the lower or upper half of vector
  12232. // elements.
  12233. int M1Start = 0;
  12234. int M2Start = 0;
  12235. int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  12236. if ((S1Op1 &&
  12237. !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
  12238. (S2Op1 &&
  12239. !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
  12240. return false;
  12241. if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
  12242. (M2Start != 0 && M2Start != (NumElements / 2)))
  12243. return false;
  12244. if (S1Op1 && S2Op1 && M1Start != M2Start)
  12245. return false;
  12246. return true;
  12247. }
  12248. /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
  12249. /// of the vector elements.
  12250. static bool areExtractExts(Value *Ext1, Value *Ext2) {
  12251. auto areExtDoubled = [](Instruction *Ext) {
  12252. return Ext->getType()->getScalarSizeInBits() ==
  12253. 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  12254. };
  12255. if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
  12256. !match(Ext2, m_ZExtOrSExt(m_Value())) ||
  12257. !areExtDoubled(cast<Instruction>(Ext1)) ||
  12258. !areExtDoubled(cast<Instruction>(Ext2)))
  12259. return false;
  12260. return true;
  12261. }
  12262. /// Check if Op could be used with vmull_high_p64 intrinsic.
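/// E.g. (illustrative), an operand of the form
///   %hi = extractelement <2 x i64> %v, i64 1
/// selects the high doubleword that PMULL2 (vmull_high_p64) operates on.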
  12263. static bool isOperandOfVmullHighP64(Value *Op) {
  12264. Value *VectorOperand = nullptr;
  12265. ConstantInt *ElementIndex = nullptr;
  12266. return match(Op, m_ExtractElt(m_Value(VectorOperand),
  12267. m_ConstantInt(ElementIndex))) &&
  12268. ElementIndex->getValue() == 1 &&
  12269. isa<FixedVectorType>(VectorOperand->getType()) &&
  12270. cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
  12271. }
  12272. /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
  12273. static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  12274. return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
  12275. }
  12276. /// Check if sinking \p I's operands to I's basic block is profitable, because
  12277. /// the operands can be folded into a target instruction, e.g.
  12278. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
  12279. bool AArch64TargetLowering::shouldSinkOperands(
  12280. Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  12281. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
  12282. switch (II->getIntrinsicID()) {
  12283. case Intrinsic::aarch64_neon_smull:
  12284. case Intrinsic::aarch64_neon_umull:
  12285. if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
  12286. /*AllowSplat=*/true)) {
  12287. Ops.push_back(&II->getOperandUse(0));
  12288. Ops.push_back(&II->getOperandUse(1));
  12289. return true;
  12290. }
  12291. [[fallthrough]];
  12292. case Intrinsic::fma:
  12293. if (isa<VectorType>(I->getType()) &&
  12294. cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
  12295. !Subtarget->hasFullFP16())
  12296. return false;
  12297. [[fallthrough]];
  12298. case Intrinsic::aarch64_neon_sqdmull:
  12299. case Intrinsic::aarch64_neon_sqdmulh:
  12300. case Intrinsic::aarch64_neon_sqrdmulh:
  12301. // Sink splats for index lane variants
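      // For example (illustrative), sinking a splat of lane 1 next to the
      // multiply lets instruction selection form the indexed variant:
      //   sqdmulh v0.4s, v1.4s, v2.s[1]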
  12302. if (isSplatShuffle(II->getOperand(0)))
  12303. Ops.push_back(&II->getOperandUse(0));
  12304. if (isSplatShuffle(II->getOperand(1)))
  12305. Ops.push_back(&II->getOperandUse(1));
  12306. return !Ops.empty();
  12307. case Intrinsic::aarch64_sve_ptest_first:
  12308. case Intrinsic::aarch64_sve_ptest_last:
  12309. if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
  12310. if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
  12311. Ops.push_back(&II->getOperandUse(0));
  12312. return !Ops.empty();
  12313. case Intrinsic::aarch64_sme_write_horiz:
  12314. case Intrinsic::aarch64_sme_write_vert:
  12315. case Intrinsic::aarch64_sme_writeq_horiz:
  12316. case Intrinsic::aarch64_sme_writeq_vert: {
  12317. auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
  12318. if (!Idx || Idx->getOpcode() != Instruction::Add)
  12319. return false;
  12320. Ops.push_back(&II->getOperandUse(1));
  12321. return true;
  12322. }
  12323. case Intrinsic::aarch64_sme_read_horiz:
  12324. case Intrinsic::aarch64_sme_read_vert:
  12325. case Intrinsic::aarch64_sme_readq_horiz:
  12326. case Intrinsic::aarch64_sme_readq_vert:
  12327. case Intrinsic::aarch64_sme_ld1b_vert:
  12328. case Intrinsic::aarch64_sme_ld1h_vert:
  12329. case Intrinsic::aarch64_sme_ld1w_vert:
  12330. case Intrinsic::aarch64_sme_ld1d_vert:
  12331. case Intrinsic::aarch64_sme_ld1q_vert:
  12332. case Intrinsic::aarch64_sme_st1b_vert:
  12333. case Intrinsic::aarch64_sme_st1h_vert:
  12334. case Intrinsic::aarch64_sme_st1w_vert:
  12335. case Intrinsic::aarch64_sme_st1d_vert:
  12336. case Intrinsic::aarch64_sme_st1q_vert:
  12337. case Intrinsic::aarch64_sme_ld1b_horiz:
  12338. case Intrinsic::aarch64_sme_ld1h_horiz:
  12339. case Intrinsic::aarch64_sme_ld1w_horiz:
  12340. case Intrinsic::aarch64_sme_ld1d_horiz:
  12341. case Intrinsic::aarch64_sme_ld1q_horiz:
  12342. case Intrinsic::aarch64_sme_st1b_horiz:
  12343. case Intrinsic::aarch64_sme_st1h_horiz:
  12344. case Intrinsic::aarch64_sme_st1w_horiz:
  12345. case Intrinsic::aarch64_sme_st1d_horiz:
  12346. case Intrinsic::aarch64_sme_st1q_horiz: {
  12347. auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
  12348. if (!Idx || Idx->getOpcode() != Instruction::Add)
  12349. return false;
  12350. Ops.push_back(&II->getOperandUse(3));
  12351. return true;
  12352. }
  12353. case Intrinsic::aarch64_neon_pmull:
  12354. if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
  12355. return false;
  12356. Ops.push_back(&II->getOperandUse(0));
  12357. Ops.push_back(&II->getOperandUse(1));
  12358. return true;
  12359. case Intrinsic::aarch64_neon_pmull64:
  12360. if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
  12361. II->getArgOperand(1)))
  12362. return false;
  12363. Ops.push_back(&II->getArgOperandUse(0));
  12364. Ops.push_back(&II->getArgOperandUse(1));
  12365. return true;
  12366. default:
  12367. return false;
  12368. }
  12369. }
  12370. if (!I->getType()->isVectorTy())
  12371. return false;
  12372. switch (I->getOpcode()) {
  12373. case Instruction::Sub:
  12374. case Instruction::Add: {
  12375. if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
  12376. return false;
  12377. // If the exts' operands extract either the lower or upper elements, we
  12378. // can sink them too.
  12379. auto Ext1 = cast<Instruction>(I->getOperand(0));
  12380. auto Ext2 = cast<Instruction>(I->getOperand(1));
  12381. if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
  12382. Ops.push_back(&Ext1->getOperandUse(0));
  12383. Ops.push_back(&Ext2->getOperandUse(0));
  12384. }
  12385. Ops.push_back(&I->getOperandUse(0));
  12386. Ops.push_back(&I->getOperandUse(1));
  12387. return true;
  12388. }
  12389. case Instruction::Mul: {
  12390. int NumZExts = 0, NumSExts = 0;
  12391. for (auto &Op : I->operands()) {
  12392. // Make sure we are not already sinking this operand
  12393. if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
  12394. continue;
  12395. if (match(&Op, m_SExt(m_Value()))) {
  12396. NumSExts++;
  12397. continue;
  12398. } else if (match(&Op, m_ZExt(m_Value()))) {
  12399. NumZExts++;
  12400. continue;
  12401. }
  12402. ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
  12403. // If the Shuffle is a splat and the operand is a zext/sext, sinking the
  12404. // operand and the s/zext can help create indexed s/umull. This is
  12405. // especially useful to prevent i64 mul being scalarized.
  12406. if (Shuffle && isSplatShuffle(Shuffle) &&
  12407. match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
  12408. Ops.push_back(&Shuffle->getOperandUse(0));
  12409. Ops.push_back(&Op);
  12410. if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
  12411. NumSExts++;
  12412. else
  12413. NumZExts++;
  12414. continue;
  12415. }
  12416. if (!Shuffle)
  12417. continue;
  12418. Value *ShuffleOperand = Shuffle->getOperand(0);
  12419. InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
  12420. if (!Insert)
  12421. continue;
  12422. Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
  12423. if (!OperandInstr)
  12424. continue;
  12425. ConstantInt *ElementConstant =
  12426. dyn_cast<ConstantInt>(Insert->getOperand(2));
  12427. // Check that the insertelement is inserting into element 0
  12428. if (!ElementConstant || ElementConstant->getZExtValue() != 0)
  12429. continue;
  12430. unsigned Opcode = OperandInstr->getOpcode();
  12431. if (Opcode == Instruction::SExt)
  12432. NumSExts++;
  12433. else if (Opcode == Instruction::ZExt)
  12434. NumZExts++;
  12435. else {
  12436. // If we find that the top bits are known 0, then we can sink and allow
  12437. // the backend to generate a umull.
  12438. unsigned Bitwidth = I->getType()->getScalarSizeInBits();
  12439. APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
  12440. const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
  12441. if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
  12442. continue;
  12443. NumZExts++;
  12444. }
  12445. Ops.push_back(&Shuffle->getOperandUse(0));
  12446. Ops.push_back(&Op);
  12447. }
    // It is only profitable to sink if we found two extends of the same type.
    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
  12450. }
  12451. default:
  12452. return false;
  12453. }
  12454. return false;
  12455. }
  12456. static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
  12457. Value *Op = ZExt->getOperand(0);
  12458. auto *SrcTy = cast<FixedVectorType>(Op->getType());
  12459. auto *DstTy = cast<FixedVectorType>(ZExt->getType());
  12460. auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
  12461. auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
  12462. assert(DstWidth % SrcWidth == 0 &&
  12463. "TBL lowering is not supported for a ZExt instruction with this "
  12464. "source & destination element type.");
  12465. unsigned ZExtFactor = DstWidth / SrcWidth;
  12466. unsigned NumElts = SrcTy->getNumElements();
  12467. IRBuilder<> Builder(ZExt);
  12468. SmallVector<int> Mask;
  12469. // Create a mask that selects <0,...,Op[i]> for each lane of the destination
  12470. // vector to replace the original ZExt. This can later be lowered to a set of
  12471. // tbl instructions.
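  // For example (illustrative), 'zext <4 x i8> to <4 x i32>' on little-endian
  // (ZExtFactor == 4) builds the mask
  //   <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>
  // where index NumElts (== 4) selects the zero byte inserted below, so each
  // i32 lane becomes the original byte followed by three zero bytes.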
  12472. for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
  12473. if (IsLittleEndian) {
  12474. if (i % ZExtFactor == 0)
  12475. Mask.push_back(i / ZExtFactor);
  12476. else
  12477. Mask.push_back(NumElts);
  12478. } else {
  12479. if ((i + 1) % ZExtFactor == 0)
  12480. Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
  12481. else
  12482. Mask.push_back(NumElts);
  12483. }
  12484. }
  12485. auto *FirstEltZero = Builder.CreateInsertElement(
  12486. PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
  12487. Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
  12488. Result = Builder.CreateBitCast(Result, DstTy);
  12489. ZExt->replaceAllUsesWith(Result);
  12490. ZExt->eraseFromParent();
  12491. }
  12492. static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
  12493. IRBuilder<> Builder(TI);
  12494. SmallVector<Value *> Parts;
  12495. int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
  12496. auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
  12497. auto *DstTy = cast<FixedVectorType>(TI->getType());
  12498. assert(SrcTy->getElementType()->isIntegerTy() &&
  12499. "Non-integer type source vector element is not supported");
  12500. assert(DstTy->getElementType()->isIntegerTy(8) &&
  12501. "Unsupported destination vector element type");
  12502. unsigned SrcElemTySz =
  12503. cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
  12504. unsigned DstElemTySz =
  12505. cast<IntegerType>(DstTy->getElementType())->getBitWidth();
  12506. assert((SrcElemTySz % DstElemTySz == 0) &&
  12507. "Cannot lower truncate to tbl instructions for a source element size "
  12508. "that is not divisible by the destination element size");
  12509. unsigned TruncFactor = SrcElemTySz / DstElemTySz;
  12510. assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
  12511. "Unsupported source vector element type size");
  12512. Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
  // Create a mask that chooses every nth byte from the source vector table of
  // bytes to create the truncated destination vector, where 'n' is the
  // truncate ratio. For example, for a truncate from Yxi64 to Yxi8, choose
  // bytes 0, 8, 16, ..., Y*8 in the little-endian format.
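  // For example (illustrative), 'trunc <8 x i32> to <8 x i8>' on little-endian
  // (TruncFactor == 4) uses the byte-selection mask
  //   <0, 4, 8, 12, 16, 20, 24, 28, 255, 255, 255, 255, 255, 255, 255, 255>
  // where 255 marks lanes that are out of range for TBL and therefore zeroed.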
  12517. SmallVector<Constant *, 16> MaskConst;
  12518. for (int Itr = 0; Itr < 16; Itr++) {
  12519. if (Itr < NumElements)
  12520. MaskConst.push_back(Builder.getInt8(
  12521. IsLittleEndian ? Itr * TruncFactor
  12522. : Itr * TruncFactor + (TruncFactor - 1)));
  12523. else
  12524. MaskConst.push_back(Builder.getInt8(255));
  12525. }
  12526. int MaxTblSz = 128 * 4;
  12527. int MaxSrcSz = SrcElemTySz * NumElements;
  12528. int ElemsPerTbl =
  12529. (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
  12530. assert(ElemsPerTbl <= 16 &&
  12531. "Maximum elements selected using TBL instruction cannot exceed 16!");
  12532. int ShuffleCount = 128 / SrcElemTySz;
  12533. SmallVector<int> ShuffleLanes;
  12534. for (int i = 0; i < ShuffleCount; ++i)
  12535. ShuffleLanes.push_back(i);
  12536. // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
  12537. // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
  12538. // call TBL & save the result in a vector of TBL results for combining later.
  12539. SmallVector<Value *> Results;
  12540. while (ShuffleLanes.back() < NumElements) {
  12541. Parts.push_back(Builder.CreateBitCast(
  12542. Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
  12543. if (Parts.size() == 4) {
  12544. auto *F = Intrinsic::getDeclaration(TI->getModule(),
  12545. Intrinsic::aarch64_neon_tbl4, VecTy);
  12546. Parts.push_back(ConstantVector::get(MaskConst));
  12547. Results.push_back(Builder.CreateCall(F, Parts));
  12548. Parts.clear();
  12549. }
  12550. for (int i = 0; i < ShuffleCount; ++i)
  12551. ShuffleLanes[i] += ShuffleCount;
  12552. }
  12553. assert((Parts.empty() || Results.empty()) &&
  12554. "Lowering trunc for vectors requiring different TBL instructions is "
  12555. "not supported!");
  12556. // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
  12557. // registers
  12558. if (!Parts.empty()) {
  12559. Intrinsic::ID TblID;
  12560. switch (Parts.size()) {
  12561. case 1:
  12562. TblID = Intrinsic::aarch64_neon_tbl1;
  12563. break;
  12564. case 2:
  12565. TblID = Intrinsic::aarch64_neon_tbl2;
  12566. break;
  12567. case 3:
  12568. TblID = Intrinsic::aarch64_neon_tbl3;
  12569. break;
  12570. }
  12571. auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
  12572. Parts.push_back(ConstantVector::get(MaskConst));
  12573. Results.push_back(Builder.CreateCall(F, Parts));
  12574. }
  12575. // Extract the destination vector from TBL result(s) after combining them
  12576. // where applicable. Currently, at most two TBLs are supported.
  12577. assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
  12578. "more than 2 tbl instructions!");
  12579. Value *FinalResult = Results[0];
  12580. if (Results.size() == 1) {
  12581. if (ElemsPerTbl < 16) {
  12582. SmallVector<int> FinalMask(ElemsPerTbl);
  12583. std::iota(FinalMask.begin(), FinalMask.end(), 0);
  12584. FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
  12585. }
  12586. } else {
  12587. SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
  12588. if (ElemsPerTbl < 16) {
  12589. std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
  12590. std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
  12591. } else {
  12592. std::iota(FinalMask.begin(), FinalMask.end(), 0);
  12593. }
  12594. FinalResult =
  12595. Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
  12596. }
  12597. TI->replaceAllUsesWith(FinalResult);
  12598. TI->eraseFromParent();
  12599. }
  12600. bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
  12601. Loop *L) const {
  12602. // shuffle_vector instructions are serialized when targeting SVE,
  12603. // see LowerSPLAT_VECTOR. This peephole is not beneficial.
  12604. if (Subtarget->useSVEForFixedLengthVectors())
  12605. return false;
  12606. // Try to optimize conversions using tbl. This requires materializing constant
  12607. // index vectors, which can increase code size and add loads. Skip the
  12608. // transform unless the conversion is in a loop block guaranteed to execute
  12609. // and we are not optimizing for size.
  12610. Function *F = I->getParent()->getParent();
  12611. if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
  12612. F->hasOptSize())
  12613. return false;
  12614. auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
  12615. auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
  12616. if (!SrcTy || !DstTy)
  12617. return false;
  12618. // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
  12619. // lowered to tbl instructions to insert the original i8 elements
  12620. // into i8x lanes. This is enabled for cases where it is beneficial.
  12621. auto *ZExt = dyn_cast<ZExtInst>(I);
  12622. if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
  12623. auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
  12624. if (DstWidth % 8 == 0 && DstWidth > 16 && DstWidth < 64) {
  12625. createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
  12626. return true;
  12627. }
  12628. }
  12629. auto *UIToFP = dyn_cast<UIToFPInst>(I);
  12630. if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
  12631. DstTy->getElementType()->isFloatTy()) {
  12632. IRBuilder<> Builder(I);
  12633. auto *ZExt = cast<ZExtInst>(
  12634. Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
  12635. auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
  12636. I->replaceAllUsesWith(UI);
  12637. I->eraseFromParent();
  12638. createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
  12639. return true;
  12640. }
  // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
  // followed by a truncate lowered using tbl.4.
  12643. auto *FPToUI = dyn_cast<FPToUIInst>(I);
  12644. if (FPToUI &&
  12645. (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
  12646. SrcTy->getElementType()->isFloatTy() &&
  12647. DstTy->getElementType()->isIntegerTy(8)) {
  12648. IRBuilder<> Builder(I);
  12649. auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
  12650. VectorType::getInteger(SrcTy));
  12651. auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
  12652. I->replaceAllUsesWith(TruncI);
  12653. I->eraseFromParent();
  12654. createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
  12655. return true;
  12656. }
  12657. // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
  12658. // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
  12659. // per lane of the input that is represented using 1,2,3 or 4 128-bit table
  12660. // registers
  12661. auto *TI = dyn_cast<TruncInst>(I);
  12662. if (TI && DstTy->getElementType()->isIntegerTy(8) &&
  12663. ((SrcTy->getElementType()->isIntegerTy(32) ||
  12664. SrcTy->getElementType()->isIntegerTy(64)) &&
  12665. (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
  12666. createTblForTrunc(TI, Subtarget->isLittleEndian());
  12667. return true;
  12668. }
  12669. return false;
  12670. }
  12671. bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
  12672. Align &RequiredAligment) const {
  12673. if (!LoadedType.isSimple() ||
  12674. (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
  12675. return false;
  12676. // Cyclone supports unaligned accesses.
  12677. RequiredAligment = Align(1);
  12678. unsigned NumBits = LoadedType.getSizeInBits();
  12679. return NumBits == 32 || NumBits == 64;
  12680. }
  12681. /// A helper function for determining the number of interleaved accesses we
  12682. /// will generate when lowering accesses of the given type.
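/// E.g. (illustrative), a <16 x i32> access (512 bits) lowered with 128-bit
/// NEON vectors results in 4 interleaved accesses.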
  12683. unsigned AArch64TargetLowering::getNumInterleavedAccesses(
  12684. VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
  12685. unsigned VecSize = 128;
  12686. if (UseScalable)
  12687. VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
  12688. return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
  12689. }
  12690. MachineMemOperand::Flags
  12691. AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
  12692. if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
  12693. I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
  12694. return MOStridedAccess;
  12695. return MachineMemOperand::MONone;
  12696. }
  12697. bool AArch64TargetLowering::isLegalInterleavedAccessType(
  12698. VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
  12699. unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  12700. unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
  12701. unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
  12702. UseScalable = false;
  12703. // Ensure that the predicate for this number of elements is available.
  12704. if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
  12705. return false;
  12706. // Ensure the number of vector elements is greater than 1.
  12707. if (NumElements < 2)
  12708. return false;
  12709. // Ensure the element type is legal.
  12710. if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
  12711. return false;
  12712. if (Subtarget->forceStreamingCompatibleSVE() ||
  12713. (Subtarget->useSVEForFixedLengthVectors() &&
  12714. (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
  12715. (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
  12716. isPowerOf2_32(NumElements) && VecSize > 128)))) {
  12717. UseScalable = true;
  12718. return true;
  12719. }
  12720. // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  12721. // 128 will be split into multiple interleaved accesses.
  12722. return VecSize == 64 || VecSize % 128 == 0;
  12723. }
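// Map a fixed-length vector type to the scalable container type used for the
// equivalent SVE ldN/stN intrinsics, e.g. (illustrative)
// <4 x float> -> <vscale x 4 x float> and <8 x i8> -> <vscale x 16 x i8>.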
  12724. static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
  12725. if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
  12726. return ScalableVectorType::get(VTy->getElementType(), 2);
  12727. if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
  12728. return ScalableVectorType::get(VTy->getElementType(), 4);
  12729. if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
  12730. return ScalableVectorType::get(VTy->getElementType(), 8);
  12731. if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
  12732. return ScalableVectorType::get(VTy->getElementType(), 8);
  12733. if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
  12734. return ScalableVectorType::get(VTy->getElementType(), 2);
  12735. if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
  12736. return ScalableVectorType::get(VTy->getElementType(), 4);
  12737. if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
  12738. return ScalableVectorType::get(VTy->getElementType(), 8);
  12739. if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
  12740. return ScalableVectorType::get(VTy->getElementType(), 16);
  12741. llvm_unreachable("Cannot handle input vector type");
  12742. }
  12743. /// Lower an interleaved load into a ldN intrinsic.
  12744. ///
  12745. /// E.g. Lower an interleaved load (Factor = 2):
  12746. /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  12747. /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
  12748. /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
  12749. ///
  12750. /// Into:
  12751. /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
  12752. /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
  12753. /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
  12754. bool AArch64TargetLowering::lowerInterleavedLoad(
  12755. LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
  12756. ArrayRef<unsigned> Indices, unsigned Factor) const {
  12757. assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
  12758. "Invalid interleave factor");
  12759. assert(!Shuffles.empty() && "Empty shufflevector input");
  12760. assert(Shuffles.size() == Indices.size() &&
  12761. "Unmatched number of shufflevectors and indices");
  12762. const DataLayout &DL = LI->getModule()->getDataLayout();
  12763. VectorType *VTy = Shuffles[0]->getType();
  12764. // Skip if we do not have NEON and skip illegal vector types. We can
  12765. // "legalize" wide vector types into multiple interleaved accesses as long as
  12766. // the vector types are divisible by 128.
  12767. bool UseScalable;
  12768. if (!Subtarget->hasNEON() ||
  12769. !isLegalInterleavedAccessType(VTy, DL, UseScalable))
  12770. return false;
  12771. unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
  12772. auto *FVTy = cast<FixedVectorType>(VTy);
  // A pointer vector cannot be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  12775. Type *EltTy = FVTy->getElementType();
  12776. if (EltTy->isPointerTy())
  12777. FVTy =
  12778. FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
  12779. // If we're going to generate more than one load, reset the sub-vector type
  12780. // to something legal.
  12781. FVTy = FixedVectorType::get(FVTy->getElementType(),
  12782. FVTy->getNumElements() / NumLoads);
  12783. auto *LDVTy =
  12784. UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
  12785. IRBuilder<> Builder(LI);
  12786. // The base address of the load.
  12787. Value *BaseAddr = LI->getPointerOperand();
  12788. if (NumLoads > 1) {
  12789. // We will compute the pointer operand of each load from the original base
  12790. // address using GEPs. Cast the base address to a pointer to the scalar
  12791. // element type.
  12792. BaseAddr = Builder.CreateBitCast(
  12793. BaseAddr,
  12794. LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  12795. }
  12796. Type *PtrTy =
  12797. UseScalable
  12798. ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
  12799. : LDVTy->getPointerTo(LI->getPointerAddressSpace());
  12800. Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
  12801. LDVTy->getElementCount());
  12802. static const Intrinsic::ID SVELoadIntrs[3] = {
  12803. Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
  12804. Intrinsic::aarch64_sve_ld4_sret};
  12805. static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
  12806. Intrinsic::aarch64_neon_ld3,
  12807. Intrinsic::aarch64_neon_ld4};
  12808. Function *LdNFunc;
  12809. if (UseScalable)
  12810. LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
  12811. SVELoadIntrs[Factor - 2], {LDVTy});
  12812. else
  12813. LdNFunc = Intrinsic::getDeclaration(
  12814. LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
  12815. // Holds sub-vectors extracted from the load intrinsic return values. The
  12816. // sub-vectors are associated with the shufflevector instructions they will
  12817. // replace.
  12818. DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
  12819. Value *PTrue = nullptr;
  12820. if (UseScalable) {
  12821. std::optional<unsigned> PgPattern =
  12822. getSVEPredPatternFromNumElements(FVTy->getNumElements());
  12823. if (Subtarget->getMinSVEVectorSizeInBits() ==
  12824. Subtarget->getMaxSVEVectorSizeInBits() &&
  12825. Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
  12826. PgPattern = AArch64SVEPredPattern::all;
  12827. auto *PTruePat =
  12828. ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
  12829. PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
  12830. {PTruePat});
  12831. }
  12832. for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
  12833. // If we're generating more than one load, compute the base address of
  12834. // subsequent loads as an offset from the previous.
  12835. if (LoadCount > 0)
  12836. BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
  12837. FVTy->getNumElements() * Factor);
  12838. CallInst *LdN;
  12839. if (UseScalable)
  12840. LdN = Builder.CreateCall(
  12841. LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
  12842. else
  12843. LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
  12844. "ldN");
  12845. // Extract and store the sub-vectors returned by the load intrinsic.
  12846. for (unsigned i = 0; i < Shuffles.size(); i++) {
  12847. ShuffleVectorInst *SVI = Shuffles[i];
  12848. unsigned Index = Indices[i];
  12849. Value *SubVec = Builder.CreateExtractValue(LdN, Index);
  12850. if (UseScalable)
  12851. SubVec = Builder.CreateExtractVector(
  12852. FVTy, SubVec,
  12853. ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
  12854. // Convert the integer vector to pointer vector if the element is pointer.
  12855. if (EltTy->isPointerTy())
  12856. SubVec = Builder.CreateIntToPtr(
  12857. SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
  12858. FVTy->getNumElements()));
  12859. SubVecs[SVI].push_back(SubVec);
  12860. }
  12861. }
  12862. // Replace uses of the shufflevector instructions with the sub-vectors
  12863. // returned by the load intrinsic. If a shufflevector instruction is
  12864. // associated with more than one sub-vector, those sub-vectors will be
  12865. // concatenated into a single wide vector.
  12866. for (ShuffleVectorInst *SVI : Shuffles) {
  12867. auto &SubVec = SubVecs[SVI];
  12868. auto *WideVec =
  12869. SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
  12870. SVI->replaceAllUsesWith(WideVec);
  12871. }
  12872. return true;
  12873. }
  12874. /// Lower an interleaved store into a stN intrinsic.
  12875. ///
  12876. /// E.g. Lower an interleaved store (Factor = 3):
  12877. /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
  12878. /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  12879. /// store <12 x i32> %i.vec, <12 x i32>* %ptr
  12880. ///
  12881. /// Into:
  12882. /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
  12883. /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
  12884. /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
  12885. /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
  12886. ///
  12887. /// Note that the new shufflevectors will be removed and we'll only generate one
  12888. /// st3 instruction in CodeGen.
  12889. ///
  12890. /// Example for a more general valid mask (Factor 3). Lower:
  12891. /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
  12892. /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
  12893. /// store <12 x i32> %i.vec, <12 x i32>* %ptr
  12894. ///
  12895. /// Into:
  12896. /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
  12897. /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
  12898. /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
  12899. /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
  12900. bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
  12901. ShuffleVectorInst *SVI,
  12902. unsigned Factor) const {
  12903. // Skip if streaming compatible SVE is enabled, because it generates invalid
  12904. // code in streaming mode when SVE length is not specified.
  12905. if (Subtarget->forceStreamingCompatibleSVE())
  12906. return false;
  12907. assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
  12908. "Invalid interleave factor");
  12909. auto *VecTy = cast<FixedVectorType>(SVI->getType());
  12910. assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
  12911. unsigned LaneLen = VecTy->getNumElements() / Factor;
  12912. Type *EltTy = VecTy->getElementType();
  12913. auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
  12914. const DataLayout &DL = SI->getModule()->getDataLayout();
  12915. bool UseScalable;
  12916. // Skip if we do not have NEON and skip illegal vector types. We can
  12917. // "legalize" wide vector types into multiple interleaved accesses as long as
  12918. // the vector types are divisible by 128.
  12919. if (!Subtarget->hasNEON() ||
  12920. !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
  12921. return false;
  12922. unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
  12923. Value *Op0 = SVI->getOperand(0);
  12924. Value *Op1 = SVI->getOperand(1);
  12925. IRBuilder<> Builder(SI);
  12926. // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  12927. // vectors to integer vectors.
  12928. if (EltTy->isPointerTy()) {
  12929. Type *IntTy = DL.getIntPtrType(EltTy);
  12930. unsigned NumOpElts =
  12931. cast<FixedVectorType>(Op0->getType())->getNumElements();
  12932. // Convert to the corresponding integer vector.
  12933. auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
  12934. Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
  12935. Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
  12936. SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  12937. }
  12938. // If we're going to generate more than one store, reset the lane length
  12939. // and sub-vector type to something legal.
  12940. LaneLen /= NumStores;
  12941. SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
  12942. auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
  12943. : SubVecTy;
  12944. // The base address of the store.
  12945. Value *BaseAddr = SI->getPointerOperand();
  12946. if (NumStores > 1) {
  12947. // We will compute the pointer operand of each store from the original base
  12948. // address using GEPs. Cast the base address to a pointer to the scalar
  12949. // element type.
  12950. BaseAddr = Builder.CreateBitCast(
  12951. BaseAddr,
  12952. SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  12953. }
  12954. auto Mask = SVI->getShuffleMask();
  // Sanity check: bail out if none of the mask indices are in range.
  // If the mask is `undef` or `poison`, `Mask` may be a vector of -1s; if all
  // of its elements are undef, an out-of-bounds read would happen later.
  if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
    return false;
  }
  12961. Type *PtrTy =
  12962. UseScalable
  12963. ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
  12964. : STVTy->getPointerTo(SI->getPointerAddressSpace());
  12965. Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
  12966. STVTy->getElementCount());
  12967. static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
  12968. Intrinsic::aarch64_sve_st3,
  12969. Intrinsic::aarch64_sve_st4};
  12970. static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
  12971. Intrinsic::aarch64_neon_st3,
  12972. Intrinsic::aarch64_neon_st4};
  12973. Function *StNFunc;
  12974. if (UseScalable)
  12975. StNFunc = Intrinsic::getDeclaration(SI->getModule(),
  12976. SVEStoreIntrs[Factor - 2], {STVTy});
  12977. else
  12978. StNFunc = Intrinsic::getDeclaration(
  12979. SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
  12980. Value *PTrue = nullptr;
  12981. if (UseScalable) {
  12982. std::optional<unsigned> PgPattern =
  12983. getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
  12984. if (Subtarget->getMinSVEVectorSizeInBits() ==
  12985. Subtarget->getMaxSVEVectorSizeInBits() &&
  12986. Subtarget->getMinSVEVectorSizeInBits() ==
  12987. DL.getTypeSizeInBits(SubVecTy))
  12988. PgPattern = AArch64SVEPredPattern::all;
  12989. auto *PTruePat =
  12990. ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
  12991. PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
  12992. {PTruePat});
  12993. }
  12994. for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
  12995. SmallVector<Value *, 5> Ops;
  12996. // Split the shufflevector operands into sub vectors for the new stN call.
  12997. for (unsigned i = 0; i < Factor; i++) {
  12998. Value *Shuffle;
  12999. unsigned IdxI = StoreCount * LaneLen * Factor + i;
  13000. if (Mask[IdxI] >= 0) {
  13001. Shuffle = Builder.CreateShuffleVector(
  13002. Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
  13003. } else {
  13004. unsigned StartMask = 0;
  13005. for (unsigned j = 1; j < LaneLen; j++) {
  13006. unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
  13007. if (Mask[IdxJ] >= 0) {
  13008. StartMask = Mask[IdxJ] - j;
  13009. break;
  13010. }
  13011. }
        // Note: Filling undef gaps with arbitrary elements is ok, since
        // those elements were going to be written anyway (as undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative; it is checked in
        // isReInterleaveMask.
  13017. Shuffle = Builder.CreateShuffleVector(
  13018. Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
  13019. }
  13020. if (UseScalable)
  13021. Shuffle = Builder.CreateInsertVector(
  13022. STVTy, UndefValue::get(STVTy), Shuffle,
  13023. ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
  13024. Ops.push_back(Shuffle);
  13025. }
  13026. if (UseScalable)
  13027. Ops.push_back(PTrue);
    // If we are generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
  13030. if (StoreCount > 0)
  13031. BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
  13032. BaseAddr, LaneLen * Factor);
  13033. Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
  13034. Builder.CreateCall(StNFunc, Ops);
  13035. }
  13036. return true;
  13037. }
  13038. EVT AArch64TargetLowering::getOptimalMemOpType(
  13039. const MemOp &Op, const AttributeList &FuncAttributes) const {
  13040. bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  13041. bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  13042. bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so just use i64 stores instead.
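  // For example (illustrative), a 32-byte zero memset can then be lowered as
  //   movi v0.2d, #0000000000000000
  //   stp  q0, q0, [x0]
  // whereas an 8-byte memset is just 'str xzr, [x0]'.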
  13046. bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  13047. auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
  13048. if (Op.isAligned(AlignCheck))
  13049. return true;
  13050. unsigned Fast;
  13051. return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
  13052. MachineMemOperand::MONone, &Fast) &&
  13053. Fast;
  13054. };
  13055. if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
  13056. AlignmentIsAcceptable(MVT::v16i8, Align(16)))
  13057. return MVT::v16i8;
  13058. if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
  13059. return MVT::f128;
  13060. if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
  13061. return MVT::i64;
  13062. if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
  13063. return MVT::i32;
  13064. return MVT::Other;
  13065. }
  13066. LLT AArch64TargetLowering::getOptimalMemOpLLT(
  13067. const MemOp &Op, const AttributeList &FuncAttributes) const {
  13068. bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  13069. bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  13070. bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so just use i64 stores instead.
  13074. bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  13075. auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
  13076. if (Op.isAligned(AlignCheck))
  13077. return true;
  13078. unsigned Fast;
  13079. return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
  13080. MachineMemOperand::MONone, &Fast) &&
  13081. Fast;
  13082. };
  13083. if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
  13084. AlignmentIsAcceptable(MVT::v2i64, Align(16)))
  13085. return LLT::fixed_vector(2, 64);
  13086. if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
  13087. return LLT::scalar(128);
  13088. if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
  13089. return LLT::scalar(64);
  13090. if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
  13091. return LLT::scalar(32);
  13092. return LLT();
  13093. }
  13094. // 12-bit optionally shifted immediates are legal for adds.
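// For example (illustrative):
//   add x0, x1, #0xfff           ; plain 12-bit immediate
//   add x0, x1, #0xfff, lsl #12  ; shifted 12-bit immediate
// but an immediate such as 0x1001 needs more than one instruction.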
  13095. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  13096. if (Immed == std::numeric_limits<int64_t>::min()) {
  13097. LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
  13098. << ": avoid UB for INT64_MIN\n");
  13099. return false;
  13100. }
  13101. // Same encoding for add/sub, just flip the sign.
  13102. Immed = std::abs(Immed);
  13103. bool IsLegal = ((Immed >> 12) == 0 ||
  13104. ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
  13105. LLVM_DEBUG(dbgs() << "Is " << Immed
  13106. << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
  13107. return IsLegal;
  13108. }
  13109. // Return false to prevent folding
  13110. // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
  13111. // if the folding leads to worse code.
  13112. bool AArch64TargetLowering::isMulAddWithConstProfitable(
  13113. SDValue AddNode, SDValue ConstNode) const {
  13114. // Let the DAGCombiner decide for vector types and large types.
  13115. const EVT VT = AddNode.getValueType();
  13116. if (VT.isVector() || VT.getScalarSizeInBits() > 64)
  13117. return true;
  // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
  // has to be composed with at least two instructions.
  13120. const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
  13121. const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
  13122. const int64_t C1 = C1Node->getSExtValue();
  13123. const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
  13124. if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
  13125. return true;
  13126. SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  13127. AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
  13128. if (Insn.size() > 1)
  13129. return false;
  13130. // Default to true and let the DAGCombiner decide.
  13131. return true;
  13132. }
  13133. // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
  13134. // immediates is the same as for an add or a sub.
  13135. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  13136. return isLegalAddImmediate(Immed);
  13137. }
  13138. /// isLegalAddressingMode - Return true if the addressing mode represented
  13139. /// by AM is legal for this target, for a load/store of the specified type.
  13140. bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
  13141. const AddrMode &AM, Type *Ty,
  13142. unsigned AS, Instruction *I) const {
  13143. // AArch64 has five basic addressing modes:
  13144. // reg
  13145. // reg + 9-bit signed offset
  13146. // reg + SIZE_IN_BYTES * 12-bit unsigned offset
  13147. // reg1 + reg2
  13148. // reg + SIZE_IN_BYTES * reg
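  // For example (illustrative), for an i64 access:
  //   ldr  x0, [x1]               ; reg
  //   ldur x0, [x1, #-256]        ; reg + 9-bit signed offset
  //   ldr  x0, [x1, #32760]       ; reg + 8 * 12-bit unsigned offset
  //   ldr  x0, [x1, x2]           ; reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]   ; reg + 8 * reg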
  13149. // No global is ever allowed as a base.
  13150. if (AM.BaseGV)
  13151. return false;
  13152. // No reg+reg+imm addressing.
  13153. if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
  13154. return false;
  13155. // FIXME: Update this method to support scalable addressing modes.
  13156. if (isa<ScalableVectorType>(Ty)) {
  13157. uint64_t VecElemNumBytes =
  13158. DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
  13159. return AM.HasBaseReg && !AM.BaseOffs &&
  13160. (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
  13161. }
  13162. // check reg + imm case:
  13163. // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  13164. uint64_t NumBytes = 0;
  13165. if (Ty->isSized()) {
  13166. uint64_t NumBits = DL.getTypeSizeInBits(Ty);
  13167. NumBytes = NumBits / 8;
  13168. if (!isPowerOf2_64(NumBits))
  13169. NumBytes = 0;
  13170. }
  13171. if (!AM.Scale) {
  13172. int64_t Offset = AM.BaseOffs;
  13173. // 9-bit signed offset
  13174. if (isInt<9>(Offset))
  13175. return true;
  13176. // 12-bit unsigned offset
  13177. unsigned shift = Log2_64(NumBytes);
  13178. if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
  13179. // Must be a multiple of NumBytes (NumBytes is a power of 2)
  13180. (Offset >> shift) << shift == Offset)
  13181. return true;
  13182. return false;
  13183. }
  13184. // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  13185. return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
  13186. }
  13187. bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
  13188. // Consider splitting large offset of struct or array.
  13189. return true;
  13190. }
  13191. bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
  13192. const MachineFunction &MF, EVT VT) const {
  13193. VT = VT.getScalarType();
  13194. if (!VT.isSimple())
  13195. return false;
  13196. switch (VT.getSimpleVT().SimpleTy) {
  13197. case MVT::f16:
  13198. return Subtarget->hasFullFP16();
  13199. case MVT::f32:
  13200. case MVT::f64:
  13201. return true;
  13202. default:
  13203. break;
  13204. }
  13205. return false;
  13206. }
  13207. bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
  13208. Type *Ty) const {
  13209. switch (Ty->getScalarType()->getTypeID()) {
  13210. case Type::FloatTyID:
  13211. case Type::DoubleTyID:
  13212. return true;
  13213. default:
  13214. return false;
  13215. }
  13216. }
  13217. bool AArch64TargetLowering::generateFMAsInMachineCombiner(
  13218. EVT VT, CodeGenOpt::Level OptLevel) const {
  13219. return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
  13220. !useSVEForFixedLengthVectorVT(VT);
  13221. }
  13222. const MCPhysReg *
  13223. AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  13224. // LR is a callee-save register, but we must treat it as clobbered by any call
  13225. // site. Hence we include LR in the scratch registers, which are in turn added
  13226. // as implicit-defs for stackmaps and patchpoints.
  13227. static const MCPhysReg ScratchRegs[] = {
  13228. AArch64::X16, AArch64::X17, AArch64::LR, 0
  13229. };
  13230. return ScratchRegs;
  13231. }
  13232. bool
  13233. AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
  13234. CombineLevel Level) const {
  13235. assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
  13236. N->getOpcode() == ISD::SRL) &&
  13237. "Expected shift op");
  13238. SDValue ShiftLHS = N->getOperand(0);
  13239. EVT VT = N->getValueType(0);
  13240. // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
  13241. // combine it with shift 'N' to let it be lowered to UBFX except:
  13242. // ((x >> C) & mask) << C.
  13243. if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
  13244. isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
  13245. uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
  13246. if (isMask_64(TruncMask)) {
  13247. SDValue AndLHS = ShiftLHS.getOperand(0);
  13248. if (AndLHS.getOpcode() == ISD::SRL) {
  13249. if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
  13250. if (N->getOpcode() == ISD::SHL)
  13251. if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
  13252. return SRLC->getZExtValue() == SHLC->getZExtValue();
  13253. return false;
  13254. }
  13255. }
  13256. }
  13257. }
  13258. return true;
  13259. }
  13260. bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
  13261. const SDNode *N) const {
  13262. assert(N->getOpcode() == ISD::XOR &&
  13263. (N->getOperand(0).getOpcode() == ISD::SHL ||
  13264. N->getOperand(0).getOpcode() == ISD::SRL) &&
  13265. "Expected XOR(SHIFT) pattern");
  13266. // Only commute if the entire NOT mask is a hidden shifted mask.
  13267. auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  13268. auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  13269. if (XorC && ShiftC) {
  13270. unsigned MaskIdx, MaskLen;
  13271. if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
  13272. unsigned ShiftAmt = ShiftC->getZExtValue();
  13273. unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
  13274. if (N->getOperand(0).getOpcode() == ISD::SHL)
  13275. return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
  13276. return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
  13277. }
  13278. }
  13279. return false;
  13280. }
  13281. bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
  13282. const SDNode *N, CombineLevel Level) const {
  13283. assert(((N->getOpcode() == ISD::SHL &&
  13284. N->getOperand(0).getOpcode() == ISD::SRL) ||
  13285. (N->getOpcode() == ISD::SRL &&
  13286. N->getOperand(0).getOpcode() == ISD::SHL)) &&
  13287. "Expected shift-shift mask");
  13288. // Don't allow multiuse shift folding with the same shift amount.
  13289. if (!N->getOperand(0)->hasOneUse())
  13290. return false;
  13291. // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
  13292. EVT VT = N->getValueType(0);
  13293. if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
  13294. auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  13295. auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  13296. return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
  13297. }
  13298. return true;
  13299. }
  13300. bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
  13301. Type *Ty) const {
  13302. assert(Ty->isIntegerTy());
  13303. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  13304. if (BitSize == 0)
  13305. return false;
  13306. int64_t Val = Imm.getSExtValue();
  13307. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
  13308. return true;
  13309. if ((int64_t)Val < 0)
  13310. Val = ~Val;
  13311. if (BitSize == 32)
  13312. Val &= (1LL << 32) - 1;
  13313. unsigned LZ = countLeadingZeros((uint64_t)Val);
  13314. unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free so return true for two or fewer MOVK.
  13316. return Shift < 3;
  13317. }
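// Worked example of the halfword counting above (illustrative only; the
// actual materialisation is decided later during instruction selection):
//   Imm = 0x0000'1234'5678'0000 -> LZ = 19, Shift = (63 - 19) / 16 = 2
//     Only halfword chunks 1 and 2 are non-zero, so the value is cheap to
//     build with MOVZ plus MOVK; return true.
//   Imm = 0x1234'5678'9abc'def0 -> LZ = 3, Shift = (63 - 3) / 16 = 3
//     All four halfwords are populated, so prefer the constant-pool load;
//     return false.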
  13318. bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
  13319. unsigned Index) const {
  13320. if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
  13321. return false;
  13322. return (Index == 0 || Index == ResVT.getVectorMinNumElements());
  13323. }
  13324. /// Turn vector tests of the signbit in the form of:
  13325. /// xor (sra X, elt_size(X)-1), -1
  13326. /// into:
  13327. /// cmge X, X, #0
  13328. static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
  13329. const AArch64Subtarget *Subtarget) {
  13330. EVT VT = N->getValueType(0);
  13331. if (!Subtarget->hasNEON() || !VT.isVector())
  13332. return SDValue();
  13333. // There must be a shift right algebraic before the xor, and the xor must be a
  13334. // 'not' operation.
  13335. SDValue Shift = N->getOperand(0);
  13336. SDValue Ones = N->getOperand(1);
  13337. if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
  13338. !ISD::isBuildVectorAllOnes(Ones.getNode()))
  13339. return SDValue();
  13340. // The shift should be smearing the sign bit across each vector element.
  13341. auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  13342. EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  13343. if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
  13344. return SDValue();
  13345. return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
  13346. }
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
//
// i32 vecreduce_add(
//  v16i32 abs(
//    v16i32 sub(
//      v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
// =================>
// i32 vecreduce_add(
//  v4i32 UADDLP(
//    v8i16 add(
//      v8i16 zext(
//        v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
//      v8i16 zext(
//        v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
  13362. static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
  13363. SelectionDAG &DAG) {
  13364. // Assumed i32 vecreduce_add
  13365. if (N->getValueType(0) != MVT::i32)
  13366. return SDValue();
  13367. SDValue VecReduceOp0 = N->getOperand(0);
  13368. unsigned Opcode = VecReduceOp0.getOpcode();
  13369. // Assumed v16i32 abs
  13370. if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
  13371. return SDValue();
  13372. SDValue ABS = VecReduceOp0;
  13373. // Assumed v16i32 sub
  13374. if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
  13375. ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
  13376. return SDValue();
  13377. SDValue SUB = ABS->getOperand(0);
  13378. unsigned Opcode0 = SUB->getOperand(0).getOpcode();
  13379. unsigned Opcode1 = SUB->getOperand(1).getOpcode();
  13380. // Assumed v16i32 type
  13381. if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
  13382. SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
  13383. return SDValue();
  13384. // Assumed zext or sext
  13385. bool IsZExt = false;
  13386. if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
  13387. IsZExt = true;
  13388. } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
  13389. IsZExt = false;
  13390. } else
  13391. return SDValue();
  13392. SDValue EXT0 = SUB->getOperand(0);
  13393. SDValue EXT1 = SUB->getOperand(1);
  13394. // Assumed zext's operand has v16i8 type
  13395. if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
  13396. EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
  13397. return SDValue();
  // Pattern is detected. Let's convert it to a sequence of nodes.
  13399. SDLoc DL(N);
  13400. // First, create the node pattern of UABD/SABD.
  13401. SDValue UABDHigh8Op0 =
  13402. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
  13403. DAG.getConstant(8, DL, MVT::i64));
  13404. SDValue UABDHigh8Op1 =
  13405. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
  13406. DAG.getConstant(8, DL, MVT::i64));
  13407. SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
  13408. UABDHigh8Op0, UABDHigh8Op1);
  13409. SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
  13410. // Second, create the node pattern of UABAL.
  13411. SDValue UABDLo8Op0 =
  13412. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
  13413. DAG.getConstant(0, DL, MVT::i64));
  13414. SDValue UABDLo8Op1 =
  13415. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
  13416. DAG.getConstant(0, DL, MVT::i64));
  13417. SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
  13418. UABDLo8Op0, UABDLo8Op1);
  13419. SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
  13420. SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
  13421. // Third, create the node of UADDLP.
  13422. SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
  13423. // Fourth, create the node of VECREDUCE_ADD.
  13424. return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
  13425. }
  13426. // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
  13427. // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
  13428. // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
  13429. static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
  13430. const AArch64Subtarget *ST) {
  13431. if (!ST->hasDotProd())
  13432. return performVecReduceAddCombineWithUADDLP(N, DAG);
  13433. SDValue Op0 = N->getOperand(0);
  13434. if (N->getValueType(0) != MVT::i32 ||
  13435. Op0.getValueType().getVectorElementType() != MVT::i32)
  13436. return SDValue();
  13437. unsigned ExtOpcode = Op0.getOpcode();
  13438. SDValue A = Op0;
  13439. SDValue B;
  13440. if (ExtOpcode == ISD::MUL) {
  13441. A = Op0.getOperand(0);
  13442. B = Op0.getOperand(1);
  13443. if (A.getOpcode() != B.getOpcode() ||
  13444. A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
  13445. return SDValue();
  13446. ExtOpcode = A.getOpcode();
  13447. }
  13448. if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
  13449. return SDValue();
  13450. EVT Op0VT = A.getOperand(0).getValueType();
  13451. if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
  13452. return SDValue();
  13453. SDLoc DL(Op0);
  13454. // For non-mla reductions B can be set to 1. For MLA we take the operand of
  13455. // the extend B.
  13456. if (!B)
  13457. B = DAG.getConstant(1, DL, Op0VT);
  13458. else
  13459. B = B.getOperand(0);
  13460. SDValue Zeros =
  13461. DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
  13462. auto DotOpcode =
  13463. (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
  13464. SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
  13465. A.getOperand(0), B);
  13466. return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  13467. }
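// Illustrative example of the dot-product rewrite above (assumes +dotprod):
//   i32 vecreduce_add(v16i32 zext(v16i8 A))
//     => i32 vecreduce_add(v4i32 UDOT(v4i32 zeroes, v16i8 A, v16i8 splat(1)))
//   i32 vecreduce_add(v16i32 mul(v16i32 sext(v16i8 A), v16i32 sext(v16i8 B)))
//     => i32 vecreduce_add(v4i32 SDOT(v4i32 zeroes, v16i8 A, v16i8 B))
// Each UDOT/SDOT lane accumulates four 8-bit products, so the final
// reduction runs on v4i32 instead of v16i32.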
  13468. // Given an (integer) vecreduce, we know the order of the inputs does not
  13469. // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
  13470. // into UADDV(UADDLP(x)). This can also happen through an extra add, where we
  13471. // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
  13472. static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  13473. auto DetectAddExtract = [&](SDValue A) {
  13474. // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
  13475. // UADDLP(x) if found.
  13476. if (A.getOpcode() != ISD::ADD)
  13477. return SDValue();
  13478. EVT VT = A.getValueType();
  13479. SDValue Op0 = A.getOperand(0);
  13480. SDValue Op1 = A.getOperand(1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
  13485. SDValue Ext0 = Op0.getOperand(0);
  13486. SDValue Ext1 = Op1.getOperand(0);
  13487. if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
  13488. Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
  13489. Ext0.getOperand(0) != Ext1.getOperand(0))
  13490. return SDValue();
    // Check that the type is twice the add types, and the extracts are from
    // upper/lower parts of the same source.
  13493. if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
  13494. VT.getVectorNumElements() * 2)
  13495. return SDValue();
  13496. if ((Ext0.getConstantOperandVal(1) != 0 &&
  13497. Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
  13498. (Ext1.getConstantOperandVal(1) != 0 &&
  13499. Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
  13500. return SDValue();
  13501. unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
  13502. : AArch64ISD::SADDLP;
  13503. return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
  13504. };
  13505. SDValue A = N->getOperand(0);
  13506. if (SDValue R = DetectAddExtract(A))
  13507. return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
  13508. if (A.getOpcode() == ISD::ADD) {
  13509. if (SDValue R = DetectAddExtract(A.getOperand(0)))
  13510. return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
  13511. DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
  13512. A.getOperand(1)));
  13513. if (SDValue R = DetectAddExtract(A.getOperand(1)))
  13514. return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
  13515. DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
  13516. A.getOperand(0)));
  13517. }
  13518. return SDValue();
  13519. }
  13520. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
  13521. TargetLowering::DAGCombinerInfo &DCI,
  13522. const AArch64Subtarget *Subtarget) {
  13523. if (DCI.isBeforeLegalizeOps())
  13524. return SDValue();
  13525. return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
  13526. }
  13527. SDValue
  13528. AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
  13529. SelectionDAG &DAG,
  13530. SmallVectorImpl<SDNode *> &Created) const {
  13531. AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  13532. if (isIntDivCheap(N->getValueType(0), Attr))
  13533. return SDValue(N,0); // Lower SDIV as SDIV
  13534. EVT VT = N->getValueType(0);
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
  13537. if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
  13538. return SDValue(N, 0);
  13539. // fold (sdiv X, pow2)
  13540. if ((VT != MVT::i32 && VT != MVT::i64) ||
  13541. !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
  13542. return SDValue();
  13543. SDLoc DL(N);
  13544. SDValue N0 = N->getOperand(0);
  13545. unsigned Lg2 = Divisor.countTrailingZeros();
  13546. SDValue Zero = DAG.getConstant(0, DL, VT);
  13547. SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
  13548. // Add (N0 < 0) ? Pow2 - 1 : 0;
  13549. SDValue CCVal;
  13550. SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
  13551. SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  13552. SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
  13553. Created.push_back(Cmp.getNode());
  13554. Created.push_back(Add.getNode());
  13555. Created.push_back(CSel.getNode());
  13556. // Divide by pow2.
  13557. SDValue SRA =
  13558. DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
  13559. // If we're dividing by a positive value, we're done. Otherwise, we must
  13560. // negate the result.
  13561. if (Divisor.isNonNegative())
  13562. return SRA;
  13563. Created.push_back(SRA.getNode());
  13564. return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
  13565. }
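// Worked example of the expansion above (illustrative), for sdiv x, 8
// (Lg2 = 3):
//   add  = x + 7
//   csel = (x < 0) ? add : x
//   sra  = csel >> 3                 (arithmetic shift)
// e.g. x = -9 gives csel = -2 and sra = -1, matching C's truncating
// -9 / 8 == -1, where a plain arithmetic shift of -9 would yield -2.
// For a negative divisor such as -8, the same sequence is followed by a
// final negation (0 - sra). A minimal standalone check of the identity
// (not part of this file):
//   int64_t sdiv_pow2(int64_t x, unsigned lg2) {
//     int64_t add = x + ((int64_t(1) << lg2) - 1);
//     int64_t sel = x < 0 ? add : x;
//     return sel >> lg2;   // equals x / (int64_t(1) << lg2)
//   }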
  13566. SDValue
  13567. AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
  13568. SelectionDAG &DAG,
  13569. SmallVectorImpl<SDNode *> &Created) const {
  13570. AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  13571. if (isIntDivCheap(N->getValueType(0), Attr))
  13572. return SDValue(N, 0); // Lower SREM as SREM
  13573. EVT VT = N->getValueType(0);
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
  13576. if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
  13577. return SDValue(N, 0);
  13578. // fold (srem X, pow2)
  13579. if ((VT != MVT::i32 && VT != MVT::i64) ||
  13580. !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
  13581. return SDValue();
  13582. unsigned Lg2 = Divisor.countTrailingZeros();
  13583. if (Lg2 == 0)
  13584. return SDValue();
  13585. SDLoc DL(N);
  13586. SDValue N0 = N->getOperand(0);
  13587. SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
  13588. SDValue Zero = DAG.getConstant(0, DL, VT);
  13589. SDValue CCVal, CSNeg;
  13590. if (Lg2 == 1) {
  13591. SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
  13592. SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
  13593. CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
  13594. Created.push_back(Cmp.getNode());
  13595. Created.push_back(And.getNode());
  13596. } else {
  13597. SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
  13598. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  13599. SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
  13600. SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
  13601. SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
  13602. CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
  13603. Negs.getValue(1));
  13604. Created.push_back(Negs.getNode());
  13605. Created.push_back(AndPos.getNode());
  13606. Created.push_back(AndNeg.getNode());
  13607. }
  13608. return CSNeg;
  13609. }
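// Worked example of the CSNEG expansion above (illustrative), for srem x, 16
// (Lg2 = 4, Pow2MinusOne = 15):
//   negs   = 0 - x                           (SUBS, also sets flags)
//   andpos = x & 15
//   andneg = negs & 15
//   csneg  = (negs < 0) ? andpos : -andneg   (MI condition)
// e.g. x = -21 gives andneg = 21 & 15 = 5 and csneg = -5, matching C's
// -21 % 16 == -5. For Lg2 == 1 the cheaper form CSNEG(x & 1, x & 1),
// predicated on x >= 0, is used instead.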
  13610. static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
  13611. switch(getIntrinsicID(S.getNode())) {
  13612. default:
  13613. break;
  13614. case Intrinsic::aarch64_sve_cntb:
  13615. return 8;
  13616. case Intrinsic::aarch64_sve_cnth:
  13617. return 16;
  13618. case Intrinsic::aarch64_sve_cntw:
  13619. return 32;
  13620. case Intrinsic::aarch64_sve_cntd:
  13621. return 64;
  13622. }
  13623. return {};
  13624. }
  13625. /// Calculates what the pre-extend type is, based on the extension
  13626. /// operation node provided by \p Extend.
  13627. ///
  13628. /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
  13629. /// pre-extend type is pulled directly from the operand, while other extend
  13630. /// operations need a bit more inspection to get this information.
  13631. ///
  13632. /// \param Extend The SDNode from the DAG that represents the extend operation
  13633. ///
  13634. /// \returns The type representing the \p Extend source type, or \p MVT::Other
  13635. /// if no valid type can be determined
  13636. static EVT calculatePreExtendType(SDValue Extend) {
  13637. switch (Extend.getOpcode()) {
  13638. case ISD::SIGN_EXTEND:
  13639. case ISD::ZERO_EXTEND:
  13640. return Extend.getOperand(0).getValueType();
  13641. case ISD::AssertSext:
  13642. case ISD::AssertZext:
  13643. case ISD::SIGN_EXTEND_INREG: {
  13644. VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
  13645. if (!TypeNode)
  13646. return MVT::Other;
  13647. return TypeNode->getVT();
  13648. }
  13649. case ISD::AND: {
  13650. ConstantSDNode *Constant =
  13651. dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
  13652. if (!Constant)
  13653. return MVT::Other;
  13654. uint32_t Mask = Constant->getZExtValue();
  13655. if (Mask == UCHAR_MAX)
  13656. return MVT::i8;
  13657. else if (Mask == USHRT_MAX)
  13658. return MVT::i16;
  13659. else if (Mask == UINT_MAX)
  13660. return MVT::i32;
  13661. return MVT::Other;
  13662. }
  13663. default:
  13664. return MVT::Other;
  13665. }
  13666. }
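// For example (illustrative): an i32 value produced by
//   (and x, 0xff)                 -> pre-extend type i8
//   (sign_extend_inreg x, i16)    -> pre-extend type i16
//   (zero_extend (i8 y))          -> pre-extend type i8 (the operand's type)
// Any other producer is reported as MVT::Other and the callers give up.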
  13667. /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
  13668. /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
  13669. /// SExt/ZExt rather than the scalar SExt/ZExt
  13670. static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
  13671. EVT VT = BV.getValueType();
  13672. if (BV.getOpcode() != ISD::BUILD_VECTOR &&
  13673. BV.getOpcode() != ISD::VECTOR_SHUFFLE)
  13674. return SDValue();
  13675. // Use the first item in the buildvector/shuffle to get the size of the
  13676. // extend, and make sure it looks valid.
  13677. SDValue Extend = BV->getOperand(0);
  13678. unsigned ExtendOpcode = Extend.getOpcode();
  13679. bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
  13680. ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
  13681. ExtendOpcode == ISD::AssertSext;
  13682. if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
  13683. ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
  13684. return SDValue();
  // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
  13687. if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
  13688. ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
  13689. return SDValue();
  13690. // Restrict valid pre-extend data type
  13691. EVT PreExtendType = calculatePreExtendType(Extend);
  13692. if (PreExtendType == MVT::Other ||
  13693. PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
  13694. return SDValue();
  13695. // Make sure all other operands are equally extended
  13696. for (SDValue Op : drop_begin(BV->ops())) {
  13697. if (Op.isUndef())
  13698. continue;
  13699. unsigned Opc = Op.getOpcode();
  13700. bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
  13701. Opc == ISD::AssertSext;
  13702. if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
  13703. return SDValue();
  13704. }
  13705. SDValue NBV;
  13706. SDLoc DL(BV);
  13707. if (BV.getOpcode() == ISD::BUILD_VECTOR) {
  13708. EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
  13709. EVT PreExtendLegalType =
  13710. PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
  13711. SmallVector<SDValue, 8> NewOps;
  13712. for (SDValue Op : BV->ops())
  13713. NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
  13714. : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
  13715. PreExtendLegalType));
  13716. NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
  13717. } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
  13718. EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
  13719. NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
  13720. BV.getOperand(1).isUndef()
  13721. ? DAG.getUNDEF(PreExtendVT)
  13722. : BV.getOperand(1).getOperand(0),
  13723. cast<ShuffleVectorSDNode>(BV)->getMask());
  13724. }
  13725. return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
  13726. }
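// Illustrative example of the rewrite above:
//   v4i32 build_vector(zext(i16 a), zext(i16 b), zext(i16 c), zext(i16 d))
//     => v4i32 zext(v4i16 build_vector(a, b, c, d))
// The per-lane scalar extends are replaced by a single vector extend of the
// narrower build_vector (or shuffle), which lets the surrounding multiply
// operate on the vector extend instead.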
  13727. /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
  13728. /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
  13729. static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
  13730. // If the value type isn't a vector, none of the operands are going to be dups
  13731. EVT VT = Mul->getValueType(0);
  13732. if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
  13733. return SDValue();
  13734. SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
  13735. SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
  // If neither operand has been changed, don't make any further changes.
  13737. if (!Op0 && !Op1)
  13738. return SDValue();
  13739. SDLoc DL(Mul);
  13740. return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
  13741. Op1 ? Op1 : Mul->getOperand(1));
  13742. }
  13743. // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
  13744. // Same for other types with equivalent constants.
  13745. static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
  13746. EVT VT = N->getValueType(0);
  13747. if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
  13748. VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
  13749. return SDValue();
  13750. if (N->getOperand(0).getOpcode() != ISD::AND ||
  13751. N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
  13752. return SDValue();
  13753. SDValue And = N->getOperand(0);
  13754. SDValue Srl = And.getOperand(0);
  13755. APInt V1, V2, V3;
  13756. if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
  13757. !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
  13758. !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
  13759. return SDValue();
  13760. unsigned HalfSize = VT.getScalarSizeInBits() / 2;
  13761. if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
  13762. V3 != (HalfSize - 1))
  13763. return SDValue();
  13764. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
  13765. EVT::getIntegerVT(*DAG.getContext(), HalfSize),
  13766. VT.getVectorElementCount() * 2);
  13767. SDLoc DL(N);
  13768. SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
  13769. SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
  13770. return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
  13771. }
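// Worked example of the combine above (illustrative), for VT = v4i32 viewed
// as v8i16 lanes (HalfSize = 16):
//   mul(and(srl(X, 15), 0x0001'0001), 0x0000'ffff)
// computes, per 16-bit half x, (sign_bit(x) ? 0xffff : 0), which is exactly
// CMLTz (compare signed less-than zero) on the v8i16 reinterpretation:
//   NVCAST(v4i32, CMLTz(NVCAST(v8i16, X)))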
  13772. static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
  13773. TargetLowering::DAGCombinerInfo &DCI,
  13774. const AArch64Subtarget *Subtarget) {
  13775. if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
  13776. return Ext;
  13777. if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
  13778. return Ext;
  13779. if (DCI.isBeforeLegalizeOps())
  13780. return SDValue();
  13781. // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
  13782. // and in MachineCombiner pass, add+mul will be combined into madd.
  13783. // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
  13784. SDLoc DL(N);
  13785. EVT VT = N->getValueType(0);
  13786. SDValue N0 = N->getOperand(0);
  13787. SDValue N1 = N->getOperand(1);
  13788. SDValue MulOper;
  13789. unsigned AddSubOpc;
  13790. auto IsAddSubWith1 = [&](SDValue V) -> bool {
  13791. AddSubOpc = V->getOpcode();
  13792. if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
  13793. SDValue Opnd = V->getOperand(1);
  13794. MulOper = V->getOperand(0);
  13795. if (AddSubOpc == ISD::SUB)
  13796. std::swap(Opnd, MulOper);
  13797. if (auto C = dyn_cast<ConstantSDNode>(Opnd))
  13798. return C->isOne();
  13799. }
  13800. return false;
  13801. };
  13802. if (IsAddSubWith1(N0)) {
  13803. SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
  13804. return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
  13805. }
  13806. if (IsAddSubWith1(N1)) {
  13807. SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
  13808. return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
  13809. }
  13810. // The below optimizations require a constant RHS.
  13811. if (!isa<ConstantSDNode>(N1))
  13812. return SDValue();
  13813. ConstantSDNode *C = cast<ConstantSDNode>(N1);
  13814. const APInt &ConstValue = C->getAPIntValue();
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern
  // match.
  13817. if (IsSVECntIntrinsic(N0) ||
  13818. (N0->getOpcode() == ISD::TRUNCATE &&
  13819. (IsSVECntIntrinsic(N0->getOperand(0)))))
  13820. if (ConstValue.sge(1) && ConstValue.sle(16))
  13821. return SDValue();
  // Multiplication by a power of two plus or minus one can be done more
  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  13827. // More aggressively, some multiplications N0 * C can be lowered to
  13828. // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  13829. // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
  13830. // TODO: lower more cases.
  13831. // TrailingZeroes is used to test if the mul can be lowered to
  13832. // shift+add+shift.
  13833. unsigned TrailingZeroes = ConstValue.countTrailingZeros();
  13834. if (TrailingZeroes) {
  13835. // Conservatively do not lower to shift+add+shift if the mul might be
  13836. // folded into smul or umul.
  13837. if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
  13838. isZeroExtended(N0.getNode(), DAG)))
  13839. return SDValue();
  13840. // Conservatively do not lower to shift+add+shift if the mul might be
  13841. // folded into madd or msub.
  13842. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
  13843. N->use_begin()->getOpcode() == ISD::SUB))
  13844. return SDValue();
  13845. }
  13846. // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  13847. // and shift+add+shift.
  13848. APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
  13849. unsigned ShiftAmt;
  13850. auto Shl = [&](SDValue N0, unsigned N1) {
  13851. SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
  13852. return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
  13853. };
  13854. auto Add = [&](SDValue N0, SDValue N1) {
  13855. return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
  13856. };
  13857. auto Sub = [&](SDValue N0, SDValue N1) {
  13858. return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
  13859. };
  13860. auto Negate = [&](SDValue N) {
  13861. SDValue Zero = DAG.getConstant(0, DL, VT);
  13862. return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
  13863. };
  // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.,
  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
  // because the (2^N - 1) can't be executed via a single instruction.
  13867. auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
  13868. unsigned BitWidth = C.getBitWidth();
  13869. for (unsigned i = 1; i < BitWidth / 2; i++) {
  13870. APInt Rem;
  13871. APInt X(BitWidth, (1 << i) + 1);
  13872. APInt::sdivrem(C, X, N, Rem);
  13873. APInt NVMinus1 = N - 1;
  13874. if (Rem == 0 && NVMinus1.isPowerOf2()) {
  13875. M = X;
  13876. return true;
  13877. }
  13878. }
  13879. return false;
  13880. };
  13881. if (ConstValue.isNonNegative()) {
  13882. // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
  13883. // (mul x, 2^N - 1) => (sub (shl x, N), x)
  13884. // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
  13885. // (mul x, (2^M + 1) * (2^N + 1))
  13886. // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
  13887. APInt SCVMinus1 = ShiftedConstValue - 1;
  13888. APInt SCVPlus1 = ShiftedConstValue + 1;
  13889. APInt CVPlus1 = ConstValue + 1;
  13890. APInt CVM, CVN;
  13891. if (SCVMinus1.isPowerOf2()) {
  13892. ShiftAmt = SCVMinus1.logBase2();
  13893. return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
  13894. } else if (CVPlus1.isPowerOf2()) {
  13895. ShiftAmt = CVPlus1.logBase2();
  13896. return Sub(Shl(N0, ShiftAmt), N0);
  13897. } else if (SCVPlus1.isPowerOf2()) {
  13898. ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
  13899. return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
  13900. } else if (Subtarget->hasLSLFast() &&
  13901. isPowPlusPlusConst(ConstValue, CVM, CVN)) {
  13902. APInt CVMMinus1 = CVM - 1;
  13903. APInt CVNMinus1 = CVN - 1;
  13904. unsigned ShiftM1 = CVMMinus1.logBase2();
  13905. unsigned ShiftN1 = CVNMinus1.logBase2();
      // LSLFast implies that shifts of up to 3 places are fast.
  13907. if (ShiftM1 <= 3 && ShiftN1 <= 3) {
  13908. SDValue MVal = Add(Shl(N0, ShiftM1), N0);
  13909. return Add(Shl(MVal, ShiftN1), MVal);
  13910. }
  13911. }
  13912. } else {
  13913. // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
  13914. // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
  13915. // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
  13916. APInt SCVPlus1 = -ShiftedConstValue + 1;
  13917. APInt CVNegPlus1 = -ConstValue + 1;
  13918. APInt CVNegMinus1 = -ConstValue - 1;
  13919. if (CVNegPlus1.isPowerOf2()) {
  13920. ShiftAmt = CVNegPlus1.logBase2();
  13921. return Sub(N0, Shl(N0, ShiftAmt));
  13922. } else if (CVNegMinus1.isPowerOf2()) {
  13923. ShiftAmt = CVNegMinus1.logBase2();
  13924. return Negate(Add(Shl(N0, ShiftAmt), N0));
  13925. } else if (SCVPlus1.isPowerOf2()) {
  13926. ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
  13927. return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
  13928. }
  13929. }
  13930. return SDValue();
  13931. }
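// A few concrete instances of the decompositions above (illustrative):
//   mul x, 6    ; 6  = (2^1 + 1) * 2^1      => shl(add(shl(x, 1), x), 1)
//   mul x, 7    ; 7  = 2^3 - 1              => sub(shl(x, 3), x)
//   mul x, -7   ; -7 = -(2^3 - 1)           => sub(x, shl(x, 3))
//   mul x, 45   ; 45 = (2^2 + 1)*(2^3 + 1)  => t = add(shl(x, 2), x);
//                                              add(shl(t, 3), t)
// The last form is only used when the subtarget reports LSLFast.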
  13932. static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
  13933. SelectionDAG &DAG) {
  13934. // Take advantage of vector comparisons producing 0 or -1 in each lane to
  13935. // optimize away operation when it's from a constant.
  13936. //
  13937. // The general transformation is:
  13938. // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  13939. // AND(VECTOR_CMP(x,y), constant2)
  13940. // constant2 = UNARYOP(constant)
  13941. // Early exit if this isn't a vector operation, the operand of the
  13942. // unary operation isn't a bitwise AND, or if the sizes of the operations
  13943. // aren't the same.
  13944. EVT VT = N->getValueType(0);
  13945. if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
  13946. N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
  13947. VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
  13948. return SDValue();
  13949. // Now check that the other operand of the AND is a constant. We could
  13950. // make the transformation for non-constant splats as well, but it's unclear
  13951. // that would be a benefit as it would not eliminate any operations, just
  13952. // perform one more step in scalar code before moving to the vector unit.
  13953. if (BuildVectorSDNode *BV =
  13954. dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
  13955. // Bail out if the vector isn't a constant.
  13956. if (!BV->isConstant())
  13957. return SDValue();
  13958. // Everything checks out. Build up the new and improved node.
  13959. SDLoc DL(N);
  13960. EVT IntVT = BV->getValueType(0);
  13961. // Create a new constant of the appropriate type for the transformed
  13962. // DAG.
  13963. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
  13964. // The AND node needs bitcasts to/from an integer vector type around it.
  13965. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
  13966. SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
  13967. N->getOperand(0)->getOperand(0), MaskConst);
  13968. SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
  13969. return Res;
  13970. }
  13971. return SDValue();
  13972. }
  13973. static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
  13974. const AArch64Subtarget *Subtarget) {
  13975. // First try to optimize away the conversion when it's conditionally from
  13976. // a constant. Vectors only.
  13977. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
  13978. return Res;
  13979. EVT VT = N->getValueType(0);
  13980. if (VT != MVT::f32 && VT != MVT::f64)
  13981. return SDValue();
  13982. // Only optimize when the source and destination types have the same width.
  13983. if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
  13984. return SDValue();
  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  13988. SDValue N0 = N->getOperand(0);
  13989. if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
  13990. // Do not change the width of a volatile load.
  13991. !cast<LoadSDNode>(N0)->isVolatile()) {
  13992. LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  13993. SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
  13994. LN0->getPointerInfo(), LN0->getAlign(),
  13995. LN0->getMemOperand()->getFlags());
  13996. // Make sure successors of the original load stay after it by updating them
  13997. // to use the new Chain.
  13998. DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
  13999. unsigned Opcode =
  14000. (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
  14001. return DAG.getNode(Opcode, SDLoc(N), VT, Load);
  14002. }
  14003. return SDValue();
  14004. }
  14005. /// Fold a floating-point multiply by power of two into floating-point to
  14006. /// fixed-point conversion.
  14007. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
  14008. TargetLowering::DAGCombinerInfo &DCI,
  14009. const AArch64Subtarget *Subtarget) {
  14010. if (!Subtarget->hasNEON() || Subtarget->forceStreamingCompatibleSVE())
  14011. return SDValue();
  14012. if (!N->getValueType(0).isSimple())
  14013. return SDValue();
  14014. SDValue Op = N->getOperand(0);
  14015. if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
  14016. return SDValue();
  14017. if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
  14018. return SDValue();
  14019. SDValue ConstVec = Op->getOperand(1);
  14020. if (!isa<BuildVectorSDNode>(ConstVec))
  14021. return SDValue();
  14022. MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  14023. uint32_t FloatBits = FloatTy.getSizeInBits();
  14024. if (FloatBits != 32 && FloatBits != 64 &&
  14025. (FloatBits != 16 || !Subtarget->hasFullFP16()))
  14026. return SDValue();
  14027. MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  14028. uint32_t IntBits = IntTy.getSizeInBits();
  14029. if (IntBits != 16 && IntBits != 32 && IntBits != 64)
  14030. return SDValue();
  14031. // Avoid conversions where iN is larger than the float (e.g., float -> i64).
  14032. if (IntBits > FloatBits)
  14033. return SDValue();
  14034. BitVector UndefElements;
  14035. BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  14036. int32_t Bits = IntBits == 64 ? 64 : 32;
  14037. int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
  14038. if (C == -1 || C == 0 || C > Bits)
  14039. return SDValue();
  14040. EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
  14041. if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
  14042. return SDValue();
  14043. if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
  14044. N->getOpcode() == ISD::FP_TO_UINT_SAT) {
  14045. EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  14046. if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
  14047. return SDValue();
  14048. }
  14049. SDLoc DL(N);
  14050. bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
  14051. N->getOpcode() == ISD::FP_TO_SINT_SAT);
  14052. unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
  14053. : Intrinsic::aarch64_neon_vcvtfp2fxu;
  14054. SDValue FixConv =
  14055. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
  14056. DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
  14057. Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
  14058. // We can handle smaller integers by generating an extra trunc.
  14059. if (IntBits < FloatBits)
  14060. FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
  14061. return FixConv;
  14062. }
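// Illustrative example of the fold above, for a v4f32 input scaled by 16.0:
//   fp_to_sint(fmul X, splat(16.0))
//     => @llvm.aarch64.neon.vcvtfp2fxs(X, i32 4)
// i.e. the multiply by 2^4 becomes a fixed-point conversion with 4
// fractional bits. Narrower integer results get an extra truncate, and the
// saturating FP_TO_[SU]INT_SAT forms are only folded when the saturation
// width matches the element width.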
  14063. /// Fold a floating-point divide by power of two into fixed-point to
  14064. /// floating-point conversion.
  14065. static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
  14066. TargetLowering::DAGCombinerInfo &DCI,
  14067. const AArch64Subtarget *Subtarget) {
  14068. if (!Subtarget->hasNEON())
  14069. return SDValue();
  14070. SDValue Op = N->getOperand(0);
  14071. unsigned Opc = Op->getOpcode();
  14072. if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
  14073. !Op.getOperand(0).getValueType().isSimple() ||
  14074. (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
  14075. return SDValue();
  14076. SDValue ConstVec = N->getOperand(1);
  14077. if (!isa<BuildVectorSDNode>(ConstVec))
  14078. return SDValue();
  14079. MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  14080. int32_t IntBits = IntTy.getSizeInBits();
  14081. if (IntBits != 16 && IntBits != 32 && IntBits != 64)
  14082. return SDValue();
  14083. MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  14084. int32_t FloatBits = FloatTy.getSizeInBits();
  14085. if (FloatBits != 32 && FloatBits != 64)
  14086. return SDValue();
  14087. // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  14088. if (IntBits > FloatBits)
  14089. return SDValue();
  14090. BitVector UndefElements;
  14091. BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  14092. int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  14093. if (C == -1 || C == 0 || C > FloatBits)
  14094. return SDValue();
  14095. MVT ResTy;
  14096. unsigned NumLanes = Op.getValueType().getVectorNumElements();
  14097. switch (NumLanes) {
  14098. default:
  14099. return SDValue();
  14100. case 2:
  14101. ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
  14102. break;
  14103. case 4:
  14104. ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
  14105. break;
  14106. }
  14107. if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
  14108. return SDValue();
  14109. SDLoc DL(N);
  14110. SDValue ConvInput = Op.getOperand(0);
  14111. bool IsSigned = Opc == ISD::SINT_TO_FP;
  14112. if (IntBits < FloatBits)
  14113. ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
  14114. ResTy, ConvInput);
  14115. unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
  14116. : Intrinsic::aarch64_neon_vcvtfxu2fp;
  14117. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
  14118. DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
  14119. DAG.getConstant(C, DL, MVT::i32));
  14120. }
  14121. /// An EXTR instruction is made up of two shifts, ORed together. This helper
  14122. /// searches for and classifies those shifts.
  14123. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
  14124. bool &FromHi) {
  14125. if (N.getOpcode() == ISD::SHL)
  14126. FromHi = false;
  14127. else if (N.getOpcode() == ISD::SRL)
  14128. FromHi = true;
  14129. else
  14130. return false;
  14131. if (!isa<ConstantSDNode>(N.getOperand(1)))
  14132. return false;
  14133. ShiftAmount = N->getConstantOperandVal(1);
  14134. Src = N->getOperand(0);
  14135. return true;
  14136. }
  14137. /// EXTR instruction extracts a contiguous chunk of bits from two existing
  14138. /// registers viewed as a high/low pair. This function looks for the pattern:
  14139. /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
  14140. /// with an EXTR. Can't quite be done in TableGen because the two immediates
  14141. /// aren't independent.
  14142. static SDValue tryCombineToEXTR(SDNode *N,
  14143. TargetLowering::DAGCombinerInfo &DCI) {
  14144. SelectionDAG &DAG = DCI.DAG;
  14145. SDLoc DL(N);
  14146. EVT VT = N->getValueType(0);
  14147. assert(N->getOpcode() == ISD::OR && "Unexpected root");
  14148. if (VT != MVT::i32 && VT != MVT::i64)
  14149. return SDValue();
  14150. SDValue LHS;
  14151. uint32_t ShiftLHS = 0;
  14152. bool LHSFromHi = false;
  14153. if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
  14154. return SDValue();
  14155. SDValue RHS;
  14156. uint32_t ShiftRHS = 0;
  14157. bool RHSFromHi = false;
  14158. if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
  14159. return SDValue();
  14160. // If they're both trying to come from the high part of the register, they're
  14161. // not really an EXTR.
  14162. if (LHSFromHi == RHSFromHi)
  14163. return SDValue();
  14164. if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
  14165. return SDValue();
  14166. if (LHSFromHi) {
  14167. std::swap(LHS, RHS);
  14168. std::swap(ShiftLHS, ShiftRHS);
  14169. }
  14170. return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
  14171. DAG.getConstant(ShiftRHS, DL, MVT::i64));
  14172. }
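// Illustrative example of the EXTR formation above, for i32:
//   or(shl(a, 24), srl(b, 8))        ; 24 + 8 == 32
//     => EXTR a, b, #8
// i.e. a 32-bit window taken from the concatenation a:b starting at bit 8.
// If the SHL appears as the second operand, the halves are swapped first so
// that the left-shifted value always supplies the high part.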
  14173. static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  14174. const AArch64TargetLowering &TLI) {
  14175. EVT VT = N->getValueType(0);
  14176. SelectionDAG &DAG = DCI.DAG;
  14177. SDLoc DL(N);
  14178. if (!VT.isVector())
  14179. return SDValue();
  // The combining code currently only works for NEON vectors. In particular,
  // it does not work for SVE when dealing with vectors wider than 128 bits.
  // It also doesn't work in streaming mode because it would generate BSL
  // instructions that are invalid in streaming mode.
  14184. if (TLI.useSVEForFixedLengthVectorVT(
  14185. VT,
  14186. DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE()))
  14187. return SDValue();
  14188. SDValue N0 = N->getOperand(0);
  14189. if (N0.getOpcode() != ISD::AND)
  14190. return SDValue();
  14191. SDValue N1 = N->getOperand(1);
  14192. if (N1.getOpcode() != ISD::AND)
  14193. return SDValue();
  14194. // InstCombine does (not (neg a)) => (add a -1).
  14195. // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
  14196. // Loop over all combinations of AND operands.
  14197. for (int i = 1; i >= 0; --i) {
  14198. for (int j = 1; j >= 0; --j) {
  14199. SDValue O0 = N0->getOperand(i);
  14200. SDValue O1 = N1->getOperand(j);
  14201. SDValue Sub, Add, SubSibling, AddSibling;
  14202. // Find a SUB and an ADD operand, one from each AND.
  14203. if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
  14204. Sub = O0;
  14205. Add = O1;
  14206. SubSibling = N0->getOperand(1 - i);
  14207. AddSibling = N1->getOperand(1 - j);
  14208. } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
  14209. Add = O0;
  14210. Sub = O1;
  14211. AddSibling = N0->getOperand(1 - i);
  14212. SubSibling = N1->getOperand(1 - j);
  14213. } else
  14214. continue;
  14215. if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
  14216. continue;
      // The constant one is always the right-hand operand of the Add.
  14218. if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
  14219. continue;
  14220. if (Sub.getOperand(1) != Add.getOperand(0))
  14221. continue;
  14222. return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
  14223. }
  14224. }
  14225. // (or (and a b) (and (not a) c)) => (bsl a b c)
  14226. // We only have to look for constant vectors here since the general, variable
  14227. // case can be handled in TableGen.
  14228. unsigned Bits = VT.getScalarSizeInBits();
  14229. uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  14230. for (int i = 1; i >= 0; --i)
  14231. for (int j = 1; j >= 0; --j) {
  14232. BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
  14233. BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
  14234. if (!BVN0 || !BVN1)
  14235. continue;
  14236. bool FoundMatch = true;
  14237. for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
  14238. ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
  14239. ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
  14240. if (!CN0 || !CN1 ||
  14241. CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
  14242. FoundMatch = false;
  14243. break;
  14244. }
  14245. }
  14246. if (FoundMatch)
  14247. return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
  14248. N0->getOperand(1 - i), N1->getOperand(1 - j));
  14249. }
  14250. return SDValue();
  14251. }
  14252. // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
  14253. // convert to csel(ccmp(.., cc0)), depending on cc1:
  14254. // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
  14255. // =>
  14256. // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
  14257. //
  14258. // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
  14259. // =>
  14260. // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
  14261. static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
  14262. EVT VT = N->getValueType(0);
  14263. SDValue CSel0 = N->getOperand(0);
  14264. SDValue CSel1 = N->getOperand(1);
  14265. if (CSel0.getOpcode() != AArch64ISD::CSEL ||
  14266. CSel1.getOpcode() != AArch64ISD::CSEL)
  14267. return SDValue();
  14268. if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
  14269. return SDValue();
  14270. if (!isNullConstant(CSel0.getOperand(0)) ||
  14271. !isOneConstant(CSel0.getOperand(1)) ||
  14272. !isNullConstant(CSel1.getOperand(0)) ||
  14273. !isOneConstant(CSel1.getOperand(1)))
  14274. return SDValue();
  14275. SDValue Cmp0 = CSel0.getOperand(3);
  14276. SDValue Cmp1 = CSel1.getOperand(3);
  14277. AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
  14278. AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
  14279. if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
  14280. return SDValue();
  14281. if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
  14282. Cmp0.getOpcode() == AArch64ISD::SUBS) {
  14283. std::swap(Cmp0, Cmp1);
  14284. std::swap(CC0, CC1);
  14285. }
  14286. if (Cmp1.getOpcode() != AArch64ISD::SUBS)
  14287. return SDValue();
  14288. SDLoc DL(N);
  14289. SDValue CCmp, Condition;
  14290. unsigned NZCV;
  14291. if (N->getOpcode() == ISD::AND) {
  14292. AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
  14293. Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
  14294. NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
  14295. } else {
  14296. AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
  14297. Condition = DAG.getConstant(CC0, DL, MVT_CC);
  14298. NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
  14299. }
  14300. SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  14301. auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
  14302. if (Op1 && Op1->getAPIntValue().isNegative() &&
  14303. Op1->getAPIntValue().sgt(-32)) {
    // CCMP accepts a constant in the range [0, 31]. If Op1 is a constant in
    // the range [-31, -1], we can select CCMN instead to avoid the extra mov
    // of the negative immediate.
  14307. SDValue AbsOp1 =
  14308. DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
  14309. CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
  14310. NZCVOp, Condition, Cmp0);
  14311. } else {
  14312. CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
  14313. Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
  14314. }
  14315. return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
  14316. CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
  14317. CCmp);
  14318. }
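// Net effect of the combine above (illustrative): a boolean expression such
// as (x0 == y0) && (x1 < y1) can be selected as
//   cmp  x0, y0
//   ccmp x1, y1, #0, eq
//   cset w0, lt
// instead of materialising both comparison results and AND-ing them. The
// NZCV immediate supplies the result for the case where the second compare
// is skipped: a failed second condition for AND, a satisfied one for OR.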
  14319. static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  14320. const AArch64Subtarget *Subtarget,
  14321. const AArch64TargetLowering &TLI) {
  14322. SelectionDAG &DAG = DCI.DAG;
  14323. EVT VT = N->getValueType(0);
  14324. if (SDValue R = performANDORCSELCombine(N, DAG))
  14325. return R;
  14326. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  14327. return SDValue();
  14328. // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
  14329. if (SDValue Res = tryCombineToEXTR(N, DCI))
  14330. return Res;
  14331. if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
  14332. return Res;
  14333. return SDValue();
  14334. }
  14335. static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
  14336. if (!MemVT.getVectorElementType().isSimple())
  14337. return false;
  14338. uint64_t MaskForTy = 0ull;
  14339. switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
  14340. case MVT::i8:
  14341. MaskForTy = 0xffull;
  14342. break;
  14343. case MVT::i16:
  14344. MaskForTy = 0xffffull;
  14345. break;
  14346. case MVT::i32:
  14347. MaskForTy = 0xffffffffull;
  14348. break;
  14349. default:
  14350. return false;
  14351. break;
  14352. }
  14353. if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
  14354. if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
  14355. return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
  14356. return false;
  14357. }
  14358. static bool isAllInactivePredicate(SDValue N) {
  14359. // Look through cast.
  14360. while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
  14361. N = N.getOperand(0);
  14362. return ISD::isConstantSplatVectorAllZeros(N.getNode());
  14363. }
  14364. static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
  14365. unsigned NumElts = N.getValueType().getVectorMinNumElements();
  14366. // Look through cast.
  14367. while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
  14368. N = N.getOperand(0);
  14369. // When reinterpreting from a type with fewer elements the "new" elements
  14370. // are not active, so bail if they're likely to be used.
  14371. if (N.getValueType().getVectorMinNumElements() < NumElts)
  14372. return false;
  14373. }
  14374. if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
  14375. return true;
  14376. // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
  14377. // or smaller than the implicit element type represented by N.
  14378. // NOTE: A larger element count implies a smaller element type.
  14379. if (N.getOpcode() == AArch64ISD::PTRUE &&
  14380. N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
  14381. return N.getValueType().getVectorMinNumElements() >= NumElts;
  14382. // If we're compiling for a specific vector-length, we can check if the
  14383. // pattern's VL equals that of the scalable vector at runtime.
  14384. if (N.getOpcode() == AArch64ISD::PTRUE) {
  14385. const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  14386. unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  14387. unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  14388. if (MaxSVESize && MinSVESize == MaxSVESize) {
  14389. unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
  14390. unsigned PatNumElts =
  14391. getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
  14392. return PatNumElts == (NumElts * VScale);
  14393. }
  14394. }
  14395. return false;
  14396. }
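// Illustrative example of the fixed-vector-length case above: with
// -msve-vector-bits=256 (so MinSVESize == MaxSVESize == 256), VScale is
// 256 / 128 = 2, and a "ptrue p0.s, vl8" used as an nxv4i1 predicate is
// all-active, since pattern vl8 covers exactly 4 * 2 lanes.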
  14397. static SDValue performReinterpretCastCombine(SDNode *N) {
  14398. SDValue LeafOp = SDValue(N, 0);
  14399. SDValue Op = N->getOperand(0);
  14400. while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
  14401. LeafOp.getValueType() != Op.getValueType())
  14402. Op = Op->getOperand(0);
  14403. if (LeafOp.getValueType() == Op.getValueType())
  14404. return Op;
  14405. return SDValue();
  14406. }
  14407. static SDValue performSVEAndCombine(SDNode *N,
  14408. TargetLowering::DAGCombinerInfo &DCI) {
  14409. if (DCI.isBeforeLegalizeOps())
  14410. return SDValue();
  14411. SelectionDAG &DAG = DCI.DAG;
  14412. SDValue Src = N->getOperand(0);
  14413. unsigned Opc = Src->getOpcode();
  14414. // Zero/any extend of an unsigned unpack
  14415. if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
  14416. SDValue UnpkOp = Src->getOperand(0);
  14417. SDValue Dup = N->getOperand(1);
  14418. if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
  14419. return SDValue();
  14420. SDLoc DL(N);
  14421. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
  14422. if (!C)
  14423. return SDValue();
  14424. uint64_t ExtVal = C->getZExtValue();
  14425. // If the mask is fully covered by the unpack, we don't need to push
  14426. // a new AND onto the operand
  14427. EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
  14428. if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
  14429. (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
  14430. (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
  14431. return Src;
    // Truncate to prevent a DUP with an over-wide constant.
  14433. APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
  14434. // Otherwise, make sure we propagate the AND to the operand
  14435. // of the unpack
  14436. Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
  14437. DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
  14438. SDValue And = DAG.getNode(ISD::AND, DL,
  14439. UnpkOp->getValueType(0), UnpkOp, Dup);
  14440. return DAG.getNode(Opc, DL, N->getValueType(0), And);
  14441. }
  14442. // If both sides of AND operations are i1 splat_vectors then
  14443. // we can produce just i1 splat_vector as the result.
  14444. if (isAllActivePredicate(DAG, N->getOperand(0)))
  14445. return N->getOperand(1);
  14446. if (isAllActivePredicate(DAG, N->getOperand(1)))
  14447. return N->getOperand(0);
  14448. if (!EnableCombineMGatherIntrinsics)
  14449. return SDValue();
  14450. SDValue Mask = N->getOperand(1);
  14451. if (!Src.hasOneUse())
  14452. return SDValue();
  14453. EVT MemVT;
  14454. // SVE load instructions perform an implicit zero-extend, which makes them
  14455. // perfect candidates for combining.
  14456. switch (Opc) {
  14457. case AArch64ISD::LD1_MERGE_ZERO:
  14458. case AArch64ISD::LDNF1_MERGE_ZERO:
  14459. case AArch64ISD::LDFF1_MERGE_ZERO:
  14460. MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
  14461. break;
  14462. case AArch64ISD::GLD1_MERGE_ZERO:
  14463. case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  14464. case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  14465. case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  14466. case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  14467. case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  14468. case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  14469. case AArch64ISD::GLDFF1_MERGE_ZERO:
  14470. case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
  14471. case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
  14472. case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
  14473. case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
  14474. case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
  14475. case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
  14476. case AArch64ISD::GLDNT1_MERGE_ZERO:
  14477. MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
  14478. break;
  14479. default:
  14480. return SDValue();
  14481. }
  14482. if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
  14483. return Src;
  14484. return SDValue();
  14485. }
  14486. static SDValue performANDCombine(SDNode *N,
  14487. TargetLowering::DAGCombinerInfo &DCI) {
  14488. SelectionDAG &DAG = DCI.DAG;
  14489. SDValue LHS = N->getOperand(0);
  14490. SDValue RHS = N->getOperand(1);
  14491. EVT VT = N->getValueType(0);
  14492. if (SDValue R = performANDORCSELCombine(N, DAG))
  14493. return R;
  14494. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  14495. return SDValue();
  14496. if (VT.isScalableVector())
  14497. return performSVEAndCombine(N, DCI);
  14498. // The combining code below works only for NEON vectors. In particular, it
  14499. // does not work for SVE when dealing with vectors wider than 128 bits.
  14500. if (!VT.is64BitVector() && !VT.is128BitVector())
  14501. return SDValue();
  14502. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  14503. if (!BVN)
  14504. return SDValue();
  14505. // AND does not accept an immediate, so check if we can use a BIC immediate
  14506. // instruction instead. We do this here instead of using a (and x, (mvni imm))
  14507. // pattern in isel, because some immediates may be lowered to the preferred
  14508. // (and x, (movi imm)) form, even though an mvni representation also exists.
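// For example (values chosen for illustration only): an AND of a v4i32 vector
// with a splat of 0xffffff00 has ~mask == 0x000000ff per element, which is
// encodable, so it can be selected as (BICi X, #0xff).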
  14509. APInt DefBits(VT.getSizeInBits(), 0);
  14510. APInt UndefBits(VT.getSizeInBits(), 0);
  14511. if (resolveBuildVector(BVN, DefBits, UndefBits)) {
  14512. SDValue NewOp;
  14513. DefBits = ~DefBits;
  14514. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
  14515. DefBits, &LHS)) ||
  14516. (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
  14517. DefBits, &LHS)))
  14518. return NewOp;
  14519. UndefBits = ~UndefBits;
  14520. if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
  14521. UndefBits, &LHS)) ||
  14522. (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
  14523. UndefBits, &LHS)))
  14524. return NewOp;
  14525. }
  14526. return SDValue();
  14527. }
  14528. static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
  14529. switch (Opcode) {
  14530. case ISD::STRICT_FADD:
  14531. case ISD::FADD:
  14532. return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
  14533. case ISD::ADD:
  14534. return VT == MVT::i64;
  14535. default:
  14536. return false;
  14537. }
  14538. }
  14539. static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
  14540. AArch64CC::CondCode Cond);
  14541. static bool isPredicateCCSettingOp(SDValue N) {
  14542. if ((N.getOpcode() == ISD::SETCC) ||
  14543. (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
  14544. (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
  14545. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
  14546. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
  14547. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
  14548. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
  14549. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
  14550. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
  14551. N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
  14552. // get_active_lane_mask is lowered to a whilelo instruction.
  14553. N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
  14554. return true;
  14555. return false;
  14556. }
  14557. // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
  14558. // ... into: "ptrue p, all" + PTEST
  14559. static SDValue
  14560. performFirstTrueTestVectorCombine(SDNode *N,
  14561. TargetLowering::DAGCombinerInfo &DCI,
  14562. const AArch64Subtarget *Subtarget) {
  14563. assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  14564. // Make sure PTEST can be legalised with illegal types.
  14565. if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
  14566. return SDValue();
  14567. SDValue N0 = N->getOperand(0);
  14568. EVT VT = N0.getValueType();
  14569. if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
  14570. !isNullConstant(N->getOperand(1)))
  14571. return SDValue();
// Restrict the DAG combine to cases where we're extracting from a
// flag-setting operation.
  14574. if (!isPredicateCCSettingOp(N0))
  14575. return SDValue();
  14576. // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
  14577. SelectionDAG &DAG = DCI.DAG;
  14578. SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
  14579. return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
  14580. }
  14581. // Materialize : Idx = (add (mul vscale, NumEls), -1)
  14582. // i1 = extract_vector_elt t37, Constant:i64<Idx>
  14583. // ... into: "ptrue p, all" + PTEST
  14584. static SDValue
  14585. performLastTrueTestVectorCombine(SDNode *N,
  14586. TargetLowering::DAGCombinerInfo &DCI,
  14587. const AArch64Subtarget *Subtarget) {
  14588. assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
// Make sure PTEST can be legalised with illegal types.
  14590. if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
  14591. return SDValue();
  14592. SDValue N0 = N->getOperand(0);
  14593. EVT OpVT = N0.getValueType();
  14594. if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
  14595. return SDValue();
  14596. // Idx == (add (mul vscale, NumEls), -1)
  14597. SDValue Idx = N->getOperand(1);
  14598. if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
  14599. return SDValue();
  14600. SDValue VS = Idx.getOperand(0);
  14601. if (VS.getOpcode() != ISD::VSCALE)
  14602. return SDValue();
  14603. unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
  14604. if (VS.getConstantOperandVal(0) != NumEls)
  14605. return SDValue();
  14606. // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
  14607. SelectionDAG &DAG = DCI.DAG;
  14608. SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
  14609. return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
  14610. }
  14611. static SDValue
  14612. performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  14613. const AArch64Subtarget *Subtarget) {
  14614. assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  14615. if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
  14616. return Res;
  14617. if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
  14618. return Res;
  14619. SelectionDAG &DAG = DCI.DAG;
  14620. SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  14621. ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
  14622. EVT VT = N->getValueType(0);
  14623. const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  14624. bool IsStrict = N0->isStrictFPOpcode();
  14625. // extract(dup x) -> x
  14626. if (N0.getOpcode() == AArch64ISD::DUP)
  14627. return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
  14628. // Rewrite for pairwise fadd pattern
  14629. // (f32 (extract_vector_elt
  14630. // (fadd (vXf32 Other)
  14631. // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
  14632. // ->
  14633. // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
// (extract_vector_elt (vXf32 Other) 1)))
  14635. // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
  14636. // we can only do this when it's used only by the extract_vector_elt.
  14637. if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
  14638. hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
  14639. (!IsStrict || N0.hasOneUse())) {
  14640. SDLoc DL(N0);
  14641. SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
  14642. SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
  14643. ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
  14644. SDValue Other = N00;
  14645. // And handle the commutative case.
  14646. if (!Shuffle) {
  14647. Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
  14648. Other = N01;
  14649. }
  14650. if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
  14651. Other == Shuffle->getOperand(0)) {
  14652. SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
  14653. DAG.getConstant(0, DL, MVT::i64));
  14654. SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
  14655. DAG.getConstant(1, DL, MVT::i64));
  14656. if (!IsStrict)
  14657. return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
  14658. // For strict_fadd we need uses of the final extract_vector to be replaced
  14659. // with the strict_fadd, but we also need uses of the chain output of the
  14660. // original strict_fadd to use the chain output of the new strict_fadd as
  14661. // otherwise it may not be deleted.
  14662. SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
  14663. {VT, MVT::Other},
  14664. {N0->getOperand(0), Extract1, Extract2});
  14665. DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
  14666. DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
  14667. return SDValue(N, 0);
  14668. }
  14669. }
  14670. return SDValue();
  14671. }
  14672. static SDValue performConcatVectorsCombine(SDNode *N,
  14673. TargetLowering::DAGCombinerInfo &DCI,
  14674. SelectionDAG &DAG) {
  14675. SDLoc dl(N);
  14676. EVT VT = N->getValueType(0);
  14677. SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  14678. unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
  14679. if (VT.isScalableVector())
  14680. return SDValue();
  14681. // Optimize concat_vectors of truncated vectors, where the intermediate
  14682. // type is illegal, to avoid said illegality, e.g.,
  14683. // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
  14684. // (v2i16 (truncate (v2i64)))))
  14685. // ->
  14686. // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
  14687. // (v4i32 (bitcast (v2i64))),
  14688. // <0, 2, 4, 6>)))
  14689. // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  14690. // on both input and result type, so we might generate worse code.
  14691. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  14692. if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
  14693. N1Opc == ISD::TRUNCATE) {
  14694. SDValue N00 = N0->getOperand(0);
  14695. SDValue N10 = N1->getOperand(0);
  14696. EVT N00VT = N00.getValueType();
  14697. if (N00VT == N10.getValueType() &&
  14698. (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
  14699. N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
  14700. MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
  14701. SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
  14702. for (size_t i = 0; i < Mask.size(); ++i)
  14703. Mask[i] = i * 2;
  14704. return DAG.getNode(ISD::TRUNCATE, dl, VT,
  14705. DAG.getVectorShuffle(
  14706. MidVT, dl,
  14707. DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
  14708. DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
  14709. }
  14710. }
  14711. if (N->getOperand(0).getValueType() == MVT::v4i8) {
  14712. // If we have a concat of v4i8 loads, convert them to a buildvector of f32
  14713. // loads to prevent having to go through the v4i8 load legalization that
  14714. // needs to extend each element into a larger type.
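// A rough sketch of the two-operand case (illustration only):
//   (v8i8 (concat_vectors (v4i8 (load p0)), (v4i8 (load p1))))
//     --> (v8i8 (bitcast (v2f32 (build_vector (f32 (load p0)),
//                                             (f32 (load p1))))))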
  14715. if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
  14716. if (V.getValueType() != MVT::v4i8)
  14717. return false;
  14718. if (V.isUndef())
  14719. return true;
  14720. LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
  14721. return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
  14722. LD->getExtensionType() == ISD::NON_EXTLOAD;
  14723. })) {
  14724. EVT NVT =
  14725. EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
  14726. SmallVector<SDValue> Ops;
  14727. for (unsigned i = 0; i < N->getNumOperands(); i++) {
  14728. SDValue V = N->getOperand(i);
  14729. if (V.isUndef())
  14730. Ops.push_back(DAG.getUNDEF(MVT::f32));
  14731. else {
  14732. LoadSDNode *LD = cast<LoadSDNode>(V);
  14733. SDValue NewLoad =
  14734. DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
  14735. LD->getMemOperand());
  14736. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
  14737. Ops.push_back(NewLoad);
  14738. }
  14739. }
  14740. return DAG.getBitcast(N->getValueType(0),
  14741. DAG.getBuildVector(NVT, dl, Ops));
  14742. }
  14743. }
  14744. // Canonicalise concat_vectors to replace concatenations of truncated nots
  14745. // with nots of concatenated truncates. This in some cases allows for multiple
  14746. // redundant negations to be eliminated.
  14747. // (concat_vectors (v4i16 (truncate (not (v4i32)))),
  14748. // (v4i16 (truncate (not (v4i32)))))
  14749. // ->
  14750. // (not (concat_vectors (v4i16 (truncate (v4i32))),
  14751. // (v4i16 (truncate (v4i32)))))
  14752. if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
  14753. N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
  14754. N->isOnlyUserOf(N1.getNode())) {
  14755. auto isBitwiseVectorNegate = [](SDValue V) {
  14756. return V->getOpcode() == ISD::XOR &&
  14757. ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
  14758. };
  14759. SDValue N00 = N0->getOperand(0);
  14760. SDValue N10 = N1->getOperand(0);
  14761. if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
  14762. isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
  14763. return DAG.getNOT(
  14764. dl,
  14765. DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
  14766. DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
  14767. N00->getOperand(0)),
  14768. DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
  14769. N10->getOperand(0))),
  14770. VT);
  14771. }
  14772. }
  14773. // Wait till after everything is legalized to try this. That way we have
  14774. // legal vector types and such.
  14775. if (DCI.isBeforeLegalizeOps())
  14776. return SDValue();
  14777. // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
  14778. // extracted subvectors from the same original vectors. Combine these into a
  14779. // single avg that operates on the two original vectors.
// avgceil is the target-independent name for rhadd; avgfloor is hadd.
  14781. // Example:
  14782. // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
  14783. // extract_subvector (v16i8 OpB, <0>))),
  14784. // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
  14785. // extract_subvector (v16i8 OpB, <8>)))))
  14786. // ->
  14787. // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
  14788. if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
  14789. (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
  14790. N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
  14791. SDValue N00 = N0->getOperand(0);
  14792. SDValue N01 = N0->getOperand(1);
  14793. SDValue N10 = N1->getOperand(0);
  14794. SDValue N11 = N1->getOperand(1);
  14795. EVT N00VT = N00.getValueType();
  14796. EVT N10VT = N10.getValueType();
  14797. if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
  14798. N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
  14799. N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
  14800. N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
  14801. SDValue N00Source = N00->getOperand(0);
  14802. SDValue N01Source = N01->getOperand(0);
  14803. SDValue N10Source = N10->getOperand(0);
  14804. SDValue N11Source = N11->getOperand(0);
  14805. if (N00Source == N10Source && N01Source == N11Source &&
  14806. N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
  14807. assert(N0.getValueType() == N1.getValueType());
  14808. uint64_t N00Index = N00.getConstantOperandVal(1);
  14809. uint64_t N01Index = N01.getConstantOperandVal(1);
  14810. uint64_t N10Index = N10.getConstantOperandVal(1);
  14811. uint64_t N11Index = N11.getConstantOperandVal(1);
  14812. if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
  14813. N10Index == N00VT.getVectorNumElements())
  14814. return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
  14815. }
  14816. }
  14817. }
  14818. // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  14819. // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  14820. // canonicalise to that.
  14821. if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
  14822. assert(VT.getScalarSizeInBits() == 64);
  14823. return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
  14824. DAG.getConstant(0, dl, MVT::i64));
  14825. }
  14826. // Canonicalise concat_vectors so that the right-hand vector has as few
  14827. // bit-casts as possible before its real operation. The primary matching
  14828. // destination for these operations will be the narrowing "2" instructions,
  14829. // which depend on the operation being performed on this right-hand vector.
  14830. // For example,
  14831. // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
  14832. // becomes
  14833. // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
  14834. if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
  14835. return SDValue();
  14836. SDValue RHS = N1->getOperand(0);
  14837. MVT RHSTy = RHS.getValueType().getSimpleVT();
  14838. // If the RHS is not a vector, this is not the pattern we're looking for.
  14839. if (!RHSTy.isVector())
  14840. return SDValue();
  14841. LLVM_DEBUG(
  14842. dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
  14843. MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
  14844. RHSTy.getVectorNumElements() * 2);
  14845. return DAG.getNode(ISD::BITCAST, dl, VT,
  14846. DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
  14847. DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
  14848. RHS));
  14849. }
  14850. static SDValue
  14851. performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  14852. SelectionDAG &DAG) {
  14853. if (DCI.isBeforeLegalizeOps())
  14854. return SDValue();
  14855. EVT VT = N->getValueType(0);
  14856. if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
  14857. return SDValue();
  14858. SDValue V = N->getOperand(0);
  14859. // NOTE: This combine exists in DAGCombiner, but that version's legality check
  14860. // blocks this combine because the non-const case requires custom lowering.
  14861. //
  14862. // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
  14863. if (V.getOpcode() == ISD::SPLAT_VECTOR)
  14864. if (isa<ConstantSDNode>(V.getOperand(0)))
  14865. return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
  14866. return SDValue();
  14867. }
  14868. static SDValue
  14869. performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  14870. SelectionDAG &DAG) {
  14871. SDLoc DL(N);
  14872. SDValue Vec = N->getOperand(0);
  14873. SDValue SubVec = N->getOperand(1);
  14874. uint64_t IdxVal = N->getConstantOperandVal(2);
  14875. EVT VecVT = Vec.getValueType();
  14876. EVT SubVT = SubVec.getValueType();
  14877. // Only do this for legal fixed vector types.
  14878. if (!VecVT.isFixedLengthVector() ||
  14879. !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
  14880. !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
  14881. return SDValue();
  14882. // Ignore widening patterns.
  14883. if (IdxVal == 0 && Vec.isUndef())
  14884. return SDValue();
  14885. // Subvector must be half the width and an "aligned" insertion.
  14886. unsigned NumSubElts = SubVT.getVectorNumElements();
  14887. if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
  14888. (IdxVal != 0 && IdxVal != NumSubElts))
  14889. return SDValue();
  14890. // Fold insert_subvector -> concat_vectors
  14891. // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  14892. // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  14893. SDValue Lo, Hi;
  14894. if (IdxVal == 0) {
  14895. Lo = SubVec;
  14896. Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
  14897. DAG.getVectorIdxConstant(NumSubElts, DL));
  14898. } else {
  14899. Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
  14900. DAG.getVectorIdxConstant(0, DL));
  14901. Hi = SubVec;
  14902. }
  14903. return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
  14904. }
  14905. static SDValue tryCombineFixedPointConvert(SDNode *N,
  14906. TargetLowering::DAGCombinerInfo &DCI,
  14907. SelectionDAG &DAG) {
  14908. // Wait until after everything is legalized to try this. That way we have
  14909. // legal vector types and such.
  14910. if (DCI.isBeforeLegalizeOps())
  14911. return SDValue();
  14912. // Transform a scalar conversion of a value from a lane extract into a
  14913. // lane extract of a vector conversion. E.g., from foo1 to foo2:
  14914. // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  14915. // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  14916. //
  14917. // The second form interacts better with instruction selection and the
  14918. // register allocator to avoid cross-class register copies that aren't
  14919. // coalescable due to a lane reference.
  14920. // Check the operand and see if it originates from a lane extract.
  14921. SDValue Op1 = N->getOperand(1);
  14922. if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  14923. return SDValue();
  14924. // Yep, no additional predication needed. Perform the transform.
  14925. SDValue IID = N->getOperand(0);
  14926. SDValue Shift = N->getOperand(2);
  14927. SDValue Vec = Op1.getOperand(0);
  14928. SDValue Lane = Op1.getOperand(1);
  14929. EVT ResTy = N->getValueType(0);
  14930. EVT VecResTy;
  14931. SDLoc DL(N);
  14932. // The vector width should be 128 bits by the time we get here, even
  14933. // if it started as 64 bits (the extract_vector handling will have
  14934. // done so). Bail if it is not.
  14935. if (Vec.getValueSizeInBits() != 128)
  14936. return SDValue();
  14937. if (Vec.getValueType() == MVT::v4i32)
  14938. VecResTy = MVT::v4f32;
  14939. else if (Vec.getValueType() == MVT::v2i64)
  14940. VecResTy = MVT::v2f64;
  14941. else
  14942. return SDValue();
  14943. SDValue Convert =
  14944. DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
  14945. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
  14946. }
  14947. // AArch64 high-vector "long" operations are formed by performing the non-high
  14948. // version on an extract_subvector of each operand which gets the high half:
  14949. //
  14950. // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
  14951. //
  14952. // However, there are cases which don't have an extract_high explicitly, but
  14953. // have another operation that can be made compatible with one for free. For
  14954. // example:
  14955. //
  14956. // (dupv64 scalar) --> (extract_high (dup128 scalar))
  14957. //
  14958. // This routine does the actual conversion of such DUPs, once outer routines
  14959. // have determined that everything else is in order.
  14960. // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
  14961. // similarly here.
  14962. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  14963. MVT VT = N.getSimpleValueType();
  14964. if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
  14965. N.getConstantOperandVal(1) == 0)
  14966. N = N.getOperand(0);
  14967. switch (N.getOpcode()) {
  14968. case AArch64ISD::DUP:
  14969. case AArch64ISD::DUPLANE8:
  14970. case AArch64ISD::DUPLANE16:
  14971. case AArch64ISD::DUPLANE32:
  14972. case AArch64ISD::DUPLANE64:
  14973. case AArch64ISD::MOVI:
  14974. case AArch64ISD::MOVIshift:
  14975. case AArch64ISD::MOVIedit:
  14976. case AArch64ISD::MOVImsl:
  14977. case AArch64ISD::MVNIshift:
  14978. case AArch64ISD::MVNImsl:
  14979. break;
  14980. default:
  14981. // FMOV could be supported, but isn't very useful, as it would only occur
// if you passed a bitcast'd floating point immediate to an eligible long
  14983. // integer op (addl, smull, ...).
  14984. return SDValue();
  14985. }
  14986. if (!VT.is64BitVector())
  14987. return SDValue();
  14988. SDLoc DL(N);
  14989. unsigned NumElems = VT.getVectorNumElements();
  14990. if (N.getValueType().is64BitVector()) {
  14991. MVT ElementTy = VT.getVectorElementType();
  14992. MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
  14993. N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
  14994. }
  14995. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
  14996. DAG.getConstant(NumElems, DL, MVT::i64));
  14997. }
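// Returns true if N is (possibly a bitcast of) an EXTRACT_SUBVECTOR whose
// start index is half the element count of its fixed-length source, i.e. it
// effectively extracts the high half of the source vector.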
  14998. static bool isEssentiallyExtractHighSubvector(SDValue N) {
  14999. if (N.getOpcode() == ISD::BITCAST)
  15000. N = N.getOperand(0);
  15001. if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
  15002. return false;
  15003. if (N.getOperand(0).getValueType().isScalableVector())
  15004. return false;
  15005. return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
  15006. N.getOperand(0).getValueType().getVectorNumElements() / 2;
  15007. }
  15008. /// Helper structure to keep track of ISD::SET_CC operands.
  15009. struct GenericSetCCInfo {
  15010. const SDValue *Opnd0;
  15011. const SDValue *Opnd1;
  15012. ISD::CondCode CC;
  15013. };
  15014. /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
  15015. struct AArch64SetCCInfo {
  15016. const SDValue *Cmp;
  15017. AArch64CC::CondCode CC;
  15018. };
  15019. /// Helper structure to keep track of SetCC information.
  15020. union SetCCInfo {
  15021. GenericSetCCInfo Generic;
  15022. AArch64SetCCInfo AArch64;
  15023. };
/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
/// GenericSetCCInfo.
  15027. struct SetCCInfoAndKind {
  15028. SetCCInfo Info;
  15029. bool IsAArch64;
  15030. };
/// Check whether or not \p Op is a SET_CC operation, either a generic or an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
  15036. /// \return True when Op is a kind of SET_CC operation.
  15037. static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
// If this is a setcc, this is straightforward.
  15039. if (Op.getOpcode() == ISD::SETCC) {
  15040. SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
  15041. SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
  15042. SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  15043. SetCCInfo.IsAArch64 = false;
  15044. return true;
  15045. }
  15046. // Otherwise, check if this is a matching csel instruction.
  15047. // In other words:
  15048. // - csel 1, 0, cc
  15049. // - csel 0, 1, !cc
  15050. if (Op.getOpcode() != AArch64ISD::CSEL)
  15051. return false;
  15052. // Set the information about the operands.
  15053. // TODO: we want the operands of the Cmp not the csel
  15054. SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  15055. SetCCInfo.IsAArch64 = true;
  15056. SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
  15057. cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// Check that the operands match the constraints:
  15059. // (1) Both operands must be constants.
  15060. // (2) One must be 1 and the other must be 0.
  15061. ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  15062. ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  15063. // Check (1).
  15064. if (!TValue || !FValue)
  15065. return false;
  15066. // Check (2).
  15067. if (!TValue->isOne()) {
  15068. // Update the comparison when we are interested in !cc.
  15069. std::swap(TValue, FValue);
  15070. SetCCInfo.Info.AArch64.CC =
  15071. AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  15072. }
  15073. return TValue->isOne() && FValue->isZero();
  15074. }
  15075. // Returns true if Op is setcc or zext of setcc.
  15076. static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
  15077. if (isSetCC(Op, Info))
  15078. return true;
  15079. return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
  15080. isSetCC(Op->getOperand(0), Info));
  15081. }
  15082. // The folding we want to perform is:
  15083. // (add x, [zext] (setcc cc ...) )
  15084. // -->
  15085. // (csel x, (add x, 1), !cc ...)
  15086. //
  15087. // The latter will get matched to a CSINC instruction.
  15088. static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  15089. assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  15090. SDValue LHS = Op->getOperand(0);
  15091. SDValue RHS = Op->getOperand(1);
  15092. SetCCInfoAndKind InfoAndKind;
  15093. // If both operands are a SET_CC, then we don't want to perform this
  15094. // folding and create another csel as this results in more instructions
  15095. // (and higher register usage).
  15096. if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
  15097. isSetCCOrZExtSetCC(RHS, InfoAndKind))
  15098. return SDValue();
  15099. // If neither operand is a SET_CC, give up.
  15100. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
  15101. std::swap(LHS, RHS);
  15102. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
  15103. return SDValue();
  15104. }
// FIXME: This could be generalized to work for FP comparisons.
  15106. EVT CmpVT = InfoAndKind.IsAArch64
  15107. ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
  15108. : InfoAndKind.Info.Generic.Opnd0->getValueType();
  15109. if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
  15110. return SDValue();
  15111. SDValue CCVal;
  15112. SDValue Cmp;
  15113. SDLoc dl(Op);
  15114. if (InfoAndKind.IsAArch64) {
  15115. CCVal = DAG.getConstant(
  15116. AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
  15117. MVT::i32);
  15118. Cmp = *InfoAndKind.Info.AArch64.Cmp;
  15119. } else
  15120. Cmp = getAArch64Cmp(
  15121. *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
  15122. ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
  15123. dl);
  15124. EVT VT = Op->getValueType(0);
  15125. LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
  15126. return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
  15127. }
  15128. // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
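// More precisely, the scalarised form is matched, roughly:
//   (add (extract_vector_elt (UADDV a), 0),
//        (extract_vector_elt (UADDV b), 0))
//     --> (extract_vector_elt (UADDV (add a, b)), 0)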
  15129. static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  15130. EVT VT = N->getValueType(0);
// Only handle scalar integer result types.
  15132. if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
  15133. return SDValue();
  15134. SDValue LHS = N->getOperand(0);
  15135. SDValue RHS = N->getOperand(1);
  15136. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  15137. RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
  15138. return SDValue();
  15139. auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
  15140. auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
  15141. if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
  15142. return SDValue();
  15143. SDValue Op1 = LHS->getOperand(0);
  15144. SDValue Op2 = RHS->getOperand(0);
  15145. EVT OpVT1 = Op1.getValueType();
  15146. EVT OpVT2 = Op2.getValueType();
  15147. if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
  15148. Op2.getOpcode() != AArch64ISD::UADDV ||
  15149. OpVT1.getVectorElementType() != VT)
  15150. return SDValue();
  15151. SDValue Val1 = Op1.getOperand(0);
  15152. SDValue Val2 = Op2.getOperand(0);
  15153. EVT ValVT = Val1->getValueType(0);
  15154. SDLoc DL(N);
  15155. SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
  15156. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
  15157. DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
  15158. DAG.getConstant(0, DL, MVT::i64));
  15159. }
  15160. /// Perform the scalar expression combine in the form of:
  15161. /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
  15162. /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
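/// For example (constants chosen for illustration only): with
/// LHS = CSEL(5, 1, eq, cmp) and RHS = b, this produces
/// CSINC(add(b, 5), b, eq, cmp).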
  15163. static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
  15164. EVT VT = N->getValueType(0);
  15165. if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
  15166. return SDValue();
  15167. SDValue LHS = N->getOperand(0);
  15168. SDValue RHS = N->getOperand(1);
// Handle commutativity.
  15170. if (LHS.getOpcode() != AArch64ISD::CSEL &&
  15171. LHS.getOpcode() != AArch64ISD::CSNEG) {
  15172. std::swap(LHS, RHS);
  15173. if (LHS.getOpcode() != AArch64ISD::CSEL &&
  15174. LHS.getOpcode() != AArch64ISD::CSNEG) {
  15175. return SDValue();
  15176. }
  15177. }
  15178. if (!LHS.hasOneUse())
  15179. return SDValue();
  15180. AArch64CC::CondCode AArch64CC =
  15181. static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
// The CSEL should include a constant one operand, and the CSNEG should
// include a one or an all-ones (-1) operand.
  15184. ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
  15185. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
  15186. if (!CTVal || !CFVal)
  15187. return SDValue();
  15188. if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
  15189. (CTVal->isOne() || CFVal->isOne())) &&
  15190. !(LHS.getOpcode() == AArch64ISD::CSNEG &&
  15191. (CTVal->isOne() || CFVal->isAllOnes())))
  15192. return SDValue();
  15193. // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
  15194. if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
  15195. !CFVal->isOne()) {
  15196. std::swap(CTVal, CFVal);
  15197. AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  15198. }
  15199. SDLoc DL(N);
  15200. // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
  15201. if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
  15202. !CFVal->isAllOnes()) {
  15203. APInt C = -1 * CFVal->getAPIntValue();
  15204. CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
  15205. CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
  15206. AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  15207. }
// It might be neutral for larger constants, as the immediate needs to be
// materialized in a register.
  15210. APInt ADDC = CTVal->getAPIntValue();
  15211. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  15212. if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
  15213. return SDValue();
  15214. assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
  15215. (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
  15216. "Unexpected constant value");
  15217. SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
  15218. SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
  15219. SDValue Cmp = LHS.getOperand(3);
  15220. return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
  15221. }
  15222. // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
  15223. static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
  15224. EVT VT = N->getValueType(0);
  15225. if (N->getOpcode() != ISD::ADD)
  15226. return SDValue();
  15227. SDValue Dot = N->getOperand(0);
  15228. SDValue A = N->getOperand(1);
// Handle commutativity.
  15230. auto isZeroDot = [](SDValue Dot) {
  15231. return (Dot.getOpcode() == AArch64ISD::UDOT ||
  15232. Dot.getOpcode() == AArch64ISD::SDOT) &&
  15233. isZerosVector(Dot.getOperand(0).getNode());
  15234. };
  15235. if (!isZeroDot(Dot))
  15236. std::swap(Dot, A);
  15237. if (!isZeroDot(Dot))
  15238. return SDValue();
  15239. return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
  15240. Dot.getOperand(2));
  15241. }
  15242. static bool isNegatedInteger(SDValue Op) {
  15243. return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
  15244. }
  15245. static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
  15246. SDLoc DL(Op);
  15247. EVT VT = Op.getValueType();
  15248. SDValue Zero = DAG.getConstant(0, DL, VT);
  15249. return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
  15250. }
  15251. // Try to fold
  15252. //
  15253. // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
  15254. //
// The folding helps csel to be matched with csneg without generating a
// redundant neg instruction, which includes the negation of the csel
// expansion of the abs node lowered by lowerABS.
  15258. static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
  15259. if (!isNegatedInteger(SDValue(N, 0)))
  15260. return SDValue();
  15261. SDValue CSel = N->getOperand(1);
  15262. if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
  15263. return SDValue();
  15264. SDValue N0 = CSel.getOperand(0);
  15265. SDValue N1 = CSel.getOperand(1);
// If neither of them is a negation, it's not worth the folding, as it would
// introduce two additional negations while removing only one.
  15268. if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
  15269. return SDValue();
  15270. SDValue N0N = getNegatedInteger(N0, DAG);
  15271. SDValue N1N = getNegatedInteger(N1, DAG);
  15272. SDLoc DL(N);
  15273. EVT VT = CSel.getValueType();
  15274. return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
  15275. CSel.getOperand(3));
  15276. }
  15277. // The basic add/sub long vector instructions have variants with "2" on the end
  15278. // which act on the high-half of their inputs. They are normally matched by
  15279. // patterns like:
  15280. //
  15281. // (add (zeroext (extract_high LHS)),
  15282. // (zeroext (extract_high RHS)))
  15283. // -> uaddl2 vD, vN, vM
  15284. //
  15285. // However, if one of the extracts is something like a duplicate, this
  15286. // instruction can still be used profitably. This function puts the DAG into a
  15287. // more appropriate form for those patterns to trigger.
  15288. static SDValue performAddSubLongCombine(SDNode *N,
  15289. TargetLowering::DAGCombinerInfo &DCI,
  15290. SelectionDAG &DAG) {
  15291. if (DCI.isBeforeLegalizeOps())
  15292. return SDValue();
  15293. MVT VT = N->getSimpleValueType(0);
  15294. if (!VT.is128BitVector()) {
  15295. if (N->getOpcode() == ISD::ADD)
  15296. return performSetccAddFolding(N, DAG);
  15297. return SDValue();
  15298. }
  15299. // Make sure both branches are extended in the same way.
  15300. SDValue LHS = N->getOperand(0);
  15301. SDValue RHS = N->getOperand(1);
  15302. if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
  15303. LHS.getOpcode() != ISD::SIGN_EXTEND) ||
  15304. LHS.getOpcode() != RHS.getOpcode())
  15305. return SDValue();
  15306. unsigned ExtType = LHS.getOpcode();
  15307. // It's not worth doing if at least one of the inputs isn't already an
  15308. // extract, but we don't know which it'll be so we have to try both.
  15309. if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
  15310. RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
  15311. if (!RHS.getNode())
  15312. return SDValue();
  15313. RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
  15314. } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
  15315. LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
  15316. if (!LHS.getNode())
  15317. return SDValue();
  15318. LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
  15319. }
  15320. return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
  15321. }
  15322. static bool isCMP(SDValue Op) {
  15323. return Op.getOpcode() == AArch64ISD::SUBS &&
  15324. !Op.getNode()->hasAnyUseOfValue(0);
  15325. }
  15326. // (CSEL 1 0 CC Cond) => CC
  15327. // (CSEL 0 1 CC Cond) => !CC
  15328. static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
  15329. if (Op.getOpcode() != AArch64ISD::CSEL)
  15330. return std::nullopt;
  15331. auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
  15332. if (CC == AArch64CC::AL || CC == AArch64CC::NV)
  15333. return std::nullopt;
  15334. SDValue OpLHS = Op.getOperand(0);
  15335. SDValue OpRHS = Op.getOperand(1);
  15336. if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
  15337. return CC;
  15338. if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
  15339. return getInvertedCondCode(CC);
  15340. return std::nullopt;
  15341. }
  15342. // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
  15343. // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
  15344. static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
  15345. SDValue CmpOp = Op->getOperand(2);
  15346. if (!isCMP(CmpOp))
  15347. return SDValue();
  15348. if (IsAdd) {
  15349. if (!isOneConstant(CmpOp.getOperand(1)))
  15350. return SDValue();
  15351. } else {
  15352. if (!isNullConstant(CmpOp.getOperand(0)))
  15353. return SDValue();
  15354. }
  15355. SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
  15356. auto CC = getCSETCondCode(CsetOp);
  15357. if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
  15358. return SDValue();
  15359. return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
  15360. Op->getOperand(0), Op->getOperand(1),
  15361. CsetOp.getOperand(3));
  15362. }
  15363. // (ADC x 0 cond) => (CINC x HS cond)
  15364. static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
  15365. SDValue LHS = N->getOperand(0);
  15366. SDValue RHS = N->getOperand(1);
  15367. SDValue Cond = N->getOperand(2);
  15368. if (!isNullConstant(RHS))
  15369. return SDValue();
  15370. EVT VT = N->getValueType(0);
  15371. SDLoc DL(N);
  15372. // (CINC x cc cond) <=> (CSINC x x !cc cond)
  15373. SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
  15374. return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
  15375. }
  15376. // Transform vector add(zext i8 to i32, zext i8 to i32)
  15377. // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows further use of saddl/uaddl at the lower vector widths, and
// fewer extends.
  15380. static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
  15381. EVT VT = N->getValueType(0);
  15382. if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
  15383. (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
  15384. N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
  15385. (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
  15386. N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
  15387. N->getOperand(0).getOperand(0).getValueType() !=
  15388. N->getOperand(1).getOperand(0).getValueType())
  15389. return SDValue();
  15390. SDValue N0 = N->getOperand(0).getOperand(0);
  15391. SDValue N1 = N->getOperand(1).getOperand(0);
  15392. EVT InVT = N0.getValueType();
  15393. EVT S1 = InVT.getScalarType();
  15394. EVT S2 = VT.getScalarType();
  15395. if ((S2 == MVT::i32 && S1 == MVT::i8) ||
  15396. (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
  15397. SDLoc DL(N);
  15398. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
  15399. S2.getHalfSizedIntegerVT(*DAG.getContext()),
  15400. VT.getVectorElementCount());
  15401. SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
  15402. SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
  15403. SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
  15404. return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
  15405. }
  15406. return SDValue();
  15407. }
  15408. static SDValue performBuildVectorCombine(SDNode *N,
  15409. TargetLowering::DAGCombinerInfo &DCI,
  15410. SelectionDAG &DAG) {
  15411. SDLoc DL(N);
  15412. EVT VT = N->getValueType(0);
  15413. // A build vector of two extracted elements is equivalent to an
  15414. // extract subvector where the inner vector is any-extended to the
  15415. // extract_vector_elt VT.
  15416. // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
  15417. // (extract_elt_iXX_to_i32 vec Idx+1))
  15418. // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
  15419. // For now, only consider the v2i32 case, which arises as a result of
  15420. // legalization.
  15421. if (VT != MVT::v2i32)
  15422. return SDValue();
  15423. SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
  15424. // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
  15425. if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  15426. Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  15427. // Constant index.
  15428. isa<ConstantSDNode>(Elt0->getOperand(1)) &&
  15429. isa<ConstantSDNode>(Elt1->getOperand(1)) &&
  15430. // Both EXTRACT_VECTOR_ELT from same vector...
  15431. Elt0->getOperand(0) == Elt1->getOperand(0) &&
  15432. // ... and contiguous. First element's index +1 == second element's index.
  15433. Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
  15434. // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
  15435. // ResultType's known minimum vector length.
  15436. Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
  15437. SDValue VecToExtend = Elt0->getOperand(0);
  15438. EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
  15439. if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
  15440. return SDValue();
  15441. SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
  15442. SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
  15443. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
  15444. SubvectorIdx);
  15445. }
  15446. return SDValue();
  15447. }
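// Fold truncate(DUP(x)) into DUP(truncate(x)) for 64-bit fixed-length results,
// truncating the scalar operand itself when the element type narrows from i64
// to i32, e.g. (v2i32 (trunc (v2i64 (DUP X)))) --> (v2i32 (DUP (trunc X))).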
  15448. static SDValue performTruncateCombine(SDNode *N,
  15449. SelectionDAG &DAG) {
  15450. EVT VT = N->getValueType(0);
  15451. SDValue N0 = N->getOperand(0);
  15452. if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
  15453. N0.getOpcode() == AArch64ISD::DUP) {
  15454. SDValue Op = N0.getOperand(0);
  15455. if (VT.getScalarType() == MVT::i32 &&
  15456. N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
  15457. Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
  15458. return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
  15459. }
  15460. return SDValue();
  15461. }
// Check whether a node is an extend or shift operand.
  15463. static bool isExtendOrShiftOperand(SDValue N) {
  15464. unsigned Opcode = N.getOpcode();
  15465. if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_INREG ||
  15466. Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND) {
  15467. EVT SrcVT;
  15468. if (Opcode == ISD::SIGN_EXTEND_INREG)
  15469. SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
  15470. else
  15471. SrcVT = N.getOperand(0).getValueType();
  15472. return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
  15473. } else if (Opcode == ISD::AND) {
  15474. ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
  15475. if (!CSD)
  15476. return false;
  15477. uint64_t AndMask = CSD->getZExtValue();
  15478. return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
  15479. } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
  15480. return isa<ConstantSDNode>(N.getOperand(1));
  15481. }
  15482. return false;
  15483. }
  15484. // (N - Y) + Z --> (Z - Y) + N
  15485. // when N is an extend or shift operand
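// For illustration: ((zext X) - Y) + Z --> (Z - Y) + (zext X). Keeping the
// extend/shift as a direct operand of the outer ADD makes it more likely to
// fold into an extended/shifted-register form of the ADD.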
  15486. static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
  15487. SelectionDAG &DAG) {
  15488. auto IsOneUseExtend = [](SDValue N) {
  15489. return N.hasOneUse() && isExtendOrShiftOperand(N);
  15490. };
// DAGCombiner will revert this combination when Z is a constant, which would
// result in an infinite loop, so don't enable the combination when Z is a
// constant. Likewise, if Z is a one-use extend or shift, we can't do the
// optimization either, as the combination would keep reapplying to its own
// result.
  15495. if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
  15496. return SDValue();
  15497. if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
  15498. return SDValue();
  15499. SDValue Shift = SUB.getOperand(0);
  15500. if (!IsOneUseExtend(Shift))
  15501. return SDValue();
  15502. SDLoc DL(N);
  15503. EVT VT = N->getValueType(0);
  15504. SDValue Y = SUB.getOperand(1);
  15505. SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
  15506. return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
  15507. }
  15508. static SDValue performAddCombineForShiftedOperands(SDNode *N,
  15509. SelectionDAG &DAG) {
  15510. // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
  15511. // commutative.
  15512. if (N->getOpcode() != ISD::ADD)
  15513. return SDValue();
  15514. // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
  15515. // shifted register is only available for i32 and i64.
  15516. EVT VT = N->getValueType(0);
  15517. if (VT != MVT::i32 && VT != MVT::i64)
  15518. return SDValue();
  15519. SDLoc DL(N);
  15520. SDValue LHS = N->getOperand(0);
  15521. SDValue RHS = N->getOperand(1);
  15522. if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
  15523. return Val;
  15524. if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
  15525. return Val;
  15526. uint64_t LHSImm = 0, RHSImm = 0;
// If both operands are shifted by an immediate and the shift amount is not
// greater than 4 for one of them, swap LHS and RHS to put the operand with
// the smaller shift amount on the RHS.
//
// On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
// LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
// with LSL (shift > 4). For other processors, this is a no-op for both
// performance and correctness.
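// For example (shift amounts chosen for illustration only):
//   (add (shl x, 2), (shl y, 12)) --> (add (shl y, 12), (shl x, 2))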
  15535. if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
  15536. isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
  15537. RHSImm > 4 && LHS.hasOneUse())
  15538. return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
  15539. return SDValue();
  15540. }
  15541. static SDValue performAddSubCombine(SDNode *N,
  15542. TargetLowering::DAGCombinerInfo &DCI,
  15543. SelectionDAG &DAG) {
  15544. // Try to change sum of two reductions.
  15545. if (SDValue Val = performAddUADDVCombine(N, DAG))
  15546. return Val;
  15547. if (SDValue Val = performAddDotCombine(N, DAG))
  15548. return Val;
  15549. if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
  15550. return Val;
  15551. if (SDValue Val = performNegCSelCombine(N, DAG))
  15552. return Val;
  15553. if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
  15554. return Val;
  15555. if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
  15556. return Val;
  15557. return performAddSubLongCombine(N, DCI, DAG);
  15558. }
  15559. // Massage DAGs which we can use the high-half "long" operations on into
  15560. // something isel will recognize better. E.g.
  15561. //
  15562. // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
// (aarch64_neon_umull (extract_high (v2i64 vec))
//                     (extract_high (v2i64 (dup128 scalar))))
  15565. //
  15566. static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
  15567. TargetLowering::DAGCombinerInfo &DCI,
  15568. SelectionDAG &DAG) {
  15569. if (DCI.isBeforeLegalizeOps())
  15570. return SDValue();
  15571. SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
  15572. SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
  15573. assert(LHS.getValueType().is64BitVector() &&
  15574. RHS.getValueType().is64BitVector() &&
  15575. "unexpected shape for long operation");
  15576. // Either node could be a DUP, but it's not worth doing both of them (you'd
  15577. // just as well use the non-high version) so look for a corresponding extract
  15578. // operation on the other "wing".
  15579. if (isEssentiallyExtractHighSubvector(LHS)) {
  15580. RHS = tryExtendDUPToExtractHigh(RHS, DAG);
  15581. if (!RHS.getNode())
  15582. return SDValue();
  15583. } else if (isEssentiallyExtractHighSubvector(RHS)) {
  15584. LHS = tryExtendDUPToExtractHigh(LHS, DAG);
  15585. if (!LHS.getNode())
  15586. return SDValue();
  15587. }
  15588. if (IID == Intrinsic::not_intrinsic)
  15589. return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
  15590. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
  15591. N->getOperand(0), LHS, RHS);
  15592. }
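// Fold a constant (splat or scalar) shift amount of a Neon shift intrinsic
// into the corresponding immediate-shift node, e.g. (illustration only):
//   (aarch64_neon_sqshl X, (splat 3))  --> (SQSHL_I X, 3)
//   (aarch64_neon_srshl X, (splat -2)) --> (SRSHR_I X, 2)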
  15593. static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  15594. MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  15595. unsigned ElemBits = ElemTy.getSizeInBits();
  15596. int64_t ShiftAmount;
  15597. if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
  15598. APInt SplatValue, SplatUndef;
  15599. unsigned SplatBitSize;
  15600. bool HasAnyUndefs;
  15601. if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
  15602. HasAnyUndefs, ElemBits) ||
  15603. SplatBitSize != ElemBits)
  15604. return SDValue();
  15605. ShiftAmount = SplatValue.getSExtValue();
  15606. } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
  15607. ShiftAmount = CVN->getSExtValue();
  15608. } else
  15609. return SDValue();
  15610. unsigned Opcode;
  15611. bool IsRightShift;
  15612. switch (IID) {
  15613. default:
  15614. llvm_unreachable("Unknown shift intrinsic");
  15615. case Intrinsic::aarch64_neon_sqshl:
  15616. Opcode = AArch64ISD::SQSHL_I;
  15617. IsRightShift = false;
  15618. break;
  15619. case Intrinsic::aarch64_neon_uqshl:
  15620. Opcode = AArch64ISD::UQSHL_I;
  15621. IsRightShift = false;
  15622. break;
  15623. case Intrinsic::aarch64_neon_srshl:
  15624. Opcode = AArch64ISD::SRSHR_I;
  15625. IsRightShift = true;
  15626. break;
  15627. case Intrinsic::aarch64_neon_urshl:
  15628. Opcode = AArch64ISD::URSHR_I;
  15629. IsRightShift = true;
  15630. break;
  15631. case Intrinsic::aarch64_neon_sqshlu:
  15632. Opcode = AArch64ISD::SQSHLU_I;
  15633. IsRightShift = false;
  15634. break;
  15635. case Intrinsic::aarch64_neon_sshl:
  15636. case Intrinsic::aarch64_neon_ushl:
  15637. // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
  15638. // left shift for positive shift amounts. Below, we only replace the current
// node with VSHL if this condition is met.
  15640. Opcode = AArch64ISD::VSHL;
  15641. IsRightShift = false;
  15642. break;
  15643. }
  15644. if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
  15645. SDLoc dl(N);
  15646. return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
  15647. DAG.getConstant(-ShiftAmount, dl, MVT::i32));
  15648. } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
  15649. SDLoc dl(N);
  15650. return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
  15651. DAG.getConstant(ShiftAmount, dl, MVT::i32));
  15652. }
  15653. return SDValue();
  15654. }
  15655. // The CRC32[BH] instructions ignore the high bits of their data operand. Since
  15656. // the intrinsics must be legal and take an i32, this means there's almost
  15657. // certainly going to be a zext in the DAG which we can eliminate.
  15658. static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  15659. SDValue AndN = N->getOperand(2);
  15660. if (AndN.getOpcode() != ISD::AND)
  15661. return SDValue();
  15662. ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
  15663. if (!CMask || CMask->getZExtValue() != Mask)
  15664. return SDValue();
  15665. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
  15666. N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
  15667. }
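// Helper to lower an across-lanes intrinsic to the corresponding
// AArch64-specific node applied to the vector operand, followed by an extract
// of lane 0, roughly: intrinsic(v) --> (extract_vector_elt (Opc v), 0).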
  15668. static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
  15669. SelectionDAG &DAG) {
  15670. SDLoc dl(N);
  15671. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
  15672. DAG.getNode(Opc, dl,
  15673. N->getOperand(1).getSimpleValueType(),
  15674. N->getOperand(1)),
  15675. DAG.getConstant(0, dl, MVT::i64));
  15676. }
  15677. static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
  15678. SDLoc DL(N);
  15679. SDValue Op1 = N->getOperand(1);
  15680. SDValue Op2 = N->getOperand(2);
  15681. EVT ScalarTy = Op2.getValueType();
  15682. if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
  15683. ScalarTy = MVT::i32;
// Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
  15685. SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
  15686. SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
  15687. SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
  15688. SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
  15689. return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
  15690. }
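// Lower a predicated SVE dup intrinsic (passthru, pg, scalar) to
// DUP_MERGE_PASSTHRU(pg, scalar, passthru), any-extending i8/i16 scalar
// operands to i32 first.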
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  SDValue Scalar = N->getOperand(3);
  EVT ScalarTy = Scalar.getValueType();
  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

  SDValue Passthru = N->getOperand(1);
  SDValue Pred = N->getOperand(2);
  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
                     Pred, Scalar, Passthru);
}

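// Lower the SVE ext intrinsic by bitcasting both vector operands to bytes,
// scaling the index by the element size, and emitting a byte-wise
// AArch64ISD::EXT before bitcasting back to the original type.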
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  LLVMContext &Ctx = *DAG.getContext();
  EVT VT = N->getValueType(0);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  // Current lowering only supports the SVE-ACLE types.
  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
    return SDValue();

  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
  unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
  EVT ByteVT =
      EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));

  // Convert everything to the domain of EXT (i.e. bytes).
  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
                            DAG.getConstant(ElemSize, dl, MVT::i32));

  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}

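// Try to replace an SVE wide-element compare intrinsic whose comparator is a
// splat of an in-range immediate with a SETCC_MERGE_ZERO against a splat of
// the compared element type.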
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDValue Comparator = N->getOperand(3);
  if (Comparator.getOpcode() == AArch64ISD::DUP ||
      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
    unsigned IID = getIntrinsicID(N);
    EVT VT = N->getValueType(0);
    EVT CmpVT = N->getOperand(2).getValueType();
    SDValue Pred = N->getOperand(1);
    SDValue Imm;
    SDLoc DL(N);

    switch (IID) {
    default:
      llvm_unreachable("Called with wrong intrinsic!");
      break;

    // Signed comparisons
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide: {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        int64_t ImmVal = CN->getSExtValue();
        if (ImmVal >= -16 && ImmVal <= 15)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    // Unsigned comparisons
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide: {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        uint64_t ImmVal = CN->getZExtValue();
        if (ImmVal <= 127)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    }

    if (!Imm)
      return SDValue();

    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
                       N->getOperand(2), Splat, DAG.getCondCode(CC));
  }

  return SDValue();
}

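// Materialise the boolean result of an SVE ptest: emit a PTEST/PTEST_ANY node
// to set the flags, then select 0 or 1 with a CSEL on the requested condition.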
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                        AArch64CC::CondCode Cond) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc DL(Op);
  assert(Op.getValueType().isScalableVector() &&
         TLI.isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");
  assert(Op.getValueType() == Pg.getValueType() &&
         "Expected same type for PTEST operands");

  // Ensure target specific opcodes are using legal type.
  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  SDValue TVal = DAG.getConstant(1, DL, OutVT);
  SDValue FVal = DAG.getConstant(0, DL, OutVT);

  // Ensure operands have type nxv16i1.
  if (Op.getValueType() != MVT::nxv16i1) {
    if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
        isZeroingInactiveLanes(Op))
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
    else
      Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
  }

  // Set condition code (CC) flags.
  SDValue Test = DAG.getNode(
      Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
      DL, MVT::Other, Pg, Op);

  // Convert CC to integer based on requested condition.
  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
  return DAG.getZExtOrTrunc(Res, DL, VT);
}

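// Rewrite a predicated SVE integer reduction intrinsic as the corresponding
// *_PRED node followed by an extract of element 0.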
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
                                      SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  // NOTE: The integer reduction's result type is not always linked to the
  // operand's element type so we construct it from the intrinsic's result type.
  EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
                                     SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  EVT ReduceVT = VecToReduce.getValueType();
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

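// Rewrite an ordered FP reduction intrinsic (e.g. fadda) by seeding lane 0 of
// an otherwise undef vector with the initial value before reducing, then
// extracting lane 0 of the result.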
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                                            SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Pred = N->getOperand(1);
  SDValue InitVal = N->getOperand(2);
  SDValue VecToReduce = N->getOperand(3);
  EVT ReduceVT = VecToReduce.getValueType();

  // Ordered reductions use the first lane of the result vector as the
  // reduction's initial value.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
                        DAG.getUNDEF(ReduceVT), InitVal, Zero);
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
                                       SelectionDAG &DAG, bool UnpredOp = false,
                                       bool SwapOperands = false) {
  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
  assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
  SDValue Pg = N->getOperand(1);
  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);

  // ISD way to specify an all active predicate.
  if (isAllActivePredicate(DAG, Pg)) {
    if (UnpredOp)
      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);

    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
  }

  // FUTURE: SplatVector(true)
  return SDValue();
}

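// Main DAG combine for INTRINSIC_WO_CHAIN nodes: map NEON and SVE intrinsics
// onto generic ISD or AArch64ISD nodes where a direct equivalent exists.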
static SDValue performIntrinsicCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const AArch64Subtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IID = getIntrinsicID(N);
  switch (IID) {
  default:
    break;
  case Intrinsic::get_active_lane_mask: {
    SDValue Res = SDValue();
    EVT VT = N->getValueType(0);
    if (VT.isFixedLengthVector()) {
      // We can use the SVE whilelo instruction to lower this intrinsic by
      // creating the appropriate sequence of scalable vector operations and
      // then extracting a fixed-width subvector from the scalable vector.
      SDLoc DL(N);
      SDValue ID =
          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);

      EVT WhileVT = EVT::getVectorVT(
          *DAG.getContext(), MVT::i1,
          ElementCount::getScalable(VT.getVectorNumElements()));

      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
      EVT PromVT = getPromotedVTForPredicate(WhileVT);

      // Get the fixed-width equivalent of PromVT for extraction.
      EVT ExtVT =
          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
                           VT.getVectorElementCount());

      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
                        N->getOperand(1), N->getOperand(2));
      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
                        DAG.getConstant(0, DL, MVT::i64));
      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_vcvtfxs2fp:
  case Intrinsic::aarch64_neon_vcvtfxu2fp:
    return tryCombineFixedPointConvert(N, DCI, DAG);
  case Intrinsic::aarch64_neon_saddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
  case Intrinsic::aarch64_neon_uaddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
  case Intrinsic::aarch64_neon_sminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
  case Intrinsic::aarch64_neon_uminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
  case Intrinsic::aarch64_neon_smaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
  case Intrinsic::aarch64_neon_umaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
  case Intrinsic::aarch64_neon_fmax:
    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmin:
    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmaxnm:
    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fminnm:
    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_smull:
    return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_umull:
    return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_pmull:
    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_sqdmull:
    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
  case Intrinsic::aarch64_neon_sqshl:
  case Intrinsic::aarch64_neon_uqshl:
  case Intrinsic::aarch64_neon_sqshlu:
  case Intrinsic::aarch64_neon_srshl:
  case Intrinsic::aarch64_neon_urshl:
  case Intrinsic::aarch64_neon_sshl:
  case Intrinsic::aarch64_neon_ushl:
    return tryCombineShiftImm(IID, N, DAG);
  case Intrinsic::aarch64_neon_rshrn: {
    EVT VT = N->getOperand(1).getValueType();
    SDLoc DL(N);
    SDValue Imm =
        DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT);
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm);
    SDValue Sht =
        DAG.getNode(ISD::SRL, DL, VT, Add,
                    DAG.getConstant(N->getConstantOperandVal(2), DL, VT));
    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht);
  }
  case Intrinsic::aarch64_crc32b:
  case Intrinsic::aarch64_crc32cb:
    return tryCombineCRC32(0xff, N, DAG);
  case Intrinsic::aarch64_crc32h:
  case Intrinsic::aarch64_crc32ch:
    return tryCombineCRC32(0xffff, N, DAG);
  case Intrinsic::aarch64_sve_saddv:
    // There is no i64 version of SADDV because the sign is irrelevant.
    if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
      return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
    else
      return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_uaddv:
    return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_smaxv:
    return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_umaxv:
    return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_sminv:
    return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_uminv:
    return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_orv:
    return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
  case Intrinsic::aarch64_sve_eorv:
    return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
  case Intrinsic::aarch64_sve_andv:
    return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
  case Intrinsic::aarch64_sve_index:
    return LowerSVEIntrinsicIndex(N, DAG);
  case Intrinsic::aarch64_sve_dup:
    return LowerSVEIntrinsicDUP(N, DAG);
  case Intrinsic::aarch64_sve_dup_x:
    return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
                       N->getOperand(1));
  case Intrinsic::aarch64_sve_ext:
    return LowerSVEIntrinsicEXT(N, DAG);
  case Intrinsic::aarch64_sve_mul:
    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
  case Intrinsic::aarch64_sve_mul_u:
    return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_smulh:
    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
  case Intrinsic::aarch64_sve_smulh_u:
    return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_umulh:
    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
  case Intrinsic::aarch64_sve_umulh_u:
    return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_smin:
    return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
  case Intrinsic::aarch64_sve_smin_u:
    return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_umin:
    return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
  case Intrinsic::aarch64_sve_umin_u:
    return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_smax:
    return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
  case Intrinsic::aarch64_sve_smax_u:
    return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_umax:
    return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
  case Intrinsic::aarch64_sve_umax_u:
    return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_lsl:
    return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
  case Intrinsic::aarch64_sve_lsl_u:
    return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_lsr:
    return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
  case Intrinsic::aarch64_sve_lsr_u:
    return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_asr:
    return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
  case Intrinsic::aarch64_sve_asr_u:
    return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_fadd:
    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
  case Intrinsic::aarch64_sve_fsub:
    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
  case Intrinsic::aarch64_sve_fmul:
    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
  case Intrinsic::aarch64_sve_add:
    return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
  case Intrinsic::aarch64_sve_add_u:
    return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
                       N->getOperand(3));
  case Intrinsic::aarch64_sve_sub:
    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
  case Intrinsic::aarch64_sve_sub_u:
    return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
                       N->getOperand(3));
  case Intrinsic::aarch64_sve_subr:
    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
  case Intrinsic::aarch64_sve_and:
    return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
  case Intrinsic::aarch64_sve_bic:
    return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
  case Intrinsic::aarch64_sve_eor:
    return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
  case Intrinsic::aarch64_sve_orr:
    return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
  case Intrinsic::aarch64_sve_sabd:
    return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
  case Intrinsic::aarch64_sve_sabd_u:
    return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
                       N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_uabd:
    return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
  case Intrinsic::aarch64_sve_uabd_u:
    return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
                       N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_sdiv_u:
    return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_udiv_u:
    return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_sqadd:
    return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
  case Intrinsic::aarch64_sve_sqsub:
    return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
  case Intrinsic::aarch64_sve_uqadd:
    return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
  case Intrinsic::aarch64_sve_uqsub:
    return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
  case Intrinsic::aarch64_sve_sqadd_x:
    return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_sqsub_x:
    return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_uqadd_x:
    return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_uqsub_x:
    return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_asrd:
    return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_cmphs:
    if (!N->getOperand(2).getValueType().isFloatingPoint())
      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
    break;
  case Intrinsic::aarch64_sve_cmphi:
    if (!N->getOperand(2).getValueType().isFloatingPoint())
      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
    break;
  case Intrinsic::aarch64_sve_fcmpge:
  case Intrinsic::aarch64_sve_cmpge:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETGE));
    break;
  case Intrinsic::aarch64_sve_fcmpgt:
  case Intrinsic::aarch64_sve_cmpgt:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETGT));
    break;
  case Intrinsic::aarch64_sve_fcmpeq:
  case Intrinsic::aarch64_sve_cmpeq:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
    break;
  case Intrinsic::aarch64_sve_fcmpne:
  case Intrinsic::aarch64_sve_cmpne:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETNE));
    break;
  case Intrinsic::aarch64_sve_fcmpuo:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETUO));
    break;
  case Intrinsic::aarch64_sve_fadda:
    return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
  case Intrinsic::aarch64_sve_faddv:
    return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_fmaxnmv:
    return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
  case Intrinsic::aarch64_sve_fmaxv:
    return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_fminnmv:
    return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
  case Intrinsic::aarch64_sve_fminv:
    return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_sel:
    return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_cmpeq_wide:
    return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpne_wide:
    return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpge_wide:
    return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpgt_wide:
    return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmplt_wide:
    return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmple_wide:
    return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmphs_wide:
    return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmphi_wide:
    return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmplo_wide:
    return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpls_wide:
    return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
  case Intrinsic::aarch64_sve_ptest_any:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::ANY_ACTIVE);
  case Intrinsic::aarch64_sve_ptest_first:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::FIRST_ACTIVE);
  case Intrinsic::aarch64_sve_ptest_last:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::LAST_ACTIVE);
  }
  return SDValue();
}

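// Returns true when an operand is cheap to sign- or zero-extend for the setcc
// combine below, i.e. a (masked) load that can become an extending load, or a
// splat of zero.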
static bool isCheapToExtend(const SDValue &N) {
  unsigned OC = N->getOpcode();
  return OC == ISD::LOAD || OC == ISD::MLOAD ||
         ISD::isConstantSplatVectorAllZeros(N.getNode());
}

static SDValue
performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  // If we have (sext (setcc A B)) and A and B are cheap to extend,
  // we can move the sext into the arguments and have the same result. For
  // example, if A and B are both loads, we can make those extending loads and
  // avoid an extra instruction. This pattern appears often in VLS code
  // generation where the inputs to the setcc have a different size to the
  // instruction that wants to use the result of the setcc.
  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         N->getOperand(0)->getOpcode() == ISD::SETCC);
  const SDValue SetCC = N->getOperand(0);

  const SDValue CCOp0 = SetCC.getOperand(0);
  const SDValue CCOp1 = SetCC.getOperand(1);
  if (!CCOp0->getValueType(0).isInteger() ||
      !CCOp1->getValueType(0).isInteger())
    return SDValue();

  ISD::CondCode Code =
      cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();

  ISD::NodeType ExtType =
      isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  if (isCheapToExtend(SetCC.getOperand(0)) &&
      isCheapToExtend(SetCC.getOperand(1))) {
    const SDValue Ext1 =
        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
    const SDValue Ext2 =
        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);

    return DAG.getSetCC(
        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
  }

  return SDValue();
}

static SDValue performExtendCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
  // we can convert that DUP into another extract_high (of a bigger DUP), which
  // helps the backend to decide that an sabdl2 would be useful, saving a real
  // extract_high operation.
  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
      (N->getOperand(0).getOpcode() == ISD::ABDU ||
       N->getOperand(0).getOpcode() == ISD::ABDS)) {
    SDNode *ABDNode = N->getOperand(0).getNode();
    SDValue NewABD =
        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
    if (!NewABD.getNode())
      return SDValue();

    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
  }

  if (N->getValueType(0).isFixedLengthVector() &&
      N->getOpcode() == ISD::SIGN_EXTEND &&
      N->getOperand(0)->getOpcode() == ISD::SETCC)
    return performSignExtendSetCCCombine(N, DCI, DAG);

  return SDValue();
}

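// Replace a store of a splatted value with NumVecElts scalar stores of the
// splat value, expecting the load/store optimizer to pair them afterwards.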
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
                               SDValue SplatVal, unsigned NumVecElts) {
  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
  Align OrigAlignment = St.getAlign();
  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

  // Create scalar stores. This is at least as good as the code sequence for a
  // split unaligned store which is a dup.s, ext.b, and two stores.
  // Most of the time the three stores should be replaced by store pair
  // instructions (stp).
  SDLoc DL(&St);
  SDValue BasePtr = St.getBasePtr();
  uint64_t BaseOffset = 0;

  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
  SDValue NewST1 =
      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                   OrigAlignment, St.getMemOperand()->getFlags());

  // As this is in ISel, we will not merge this add which may degrade results.
  if (BasePtr->getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
    BasePtr = BasePtr->getOperand(0);
  }

  unsigned Offset = EltOffset;
  while (--NumVecElts) {
    Align Alignment = commonAlignment(OrigAlignment, Offset);
    SDValue OffsetPtr =
        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
                          PtrInfo.getWithOffset(Offset), Alignment,
                          St.getMemOperand()->getFlags());
    Offset += EltOffset;
  }
  return NewST1;
}

// Returns an SVE type that ContentTy can be trivially sign or zero extended
// into.
static MVT getSVEContainerType(EVT ContentTy) {
  assert(ContentTy.isSimple() && "No SVE containers for extended types");

  switch (ContentTy.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("No known SVE container for this MVT type");
  case MVT::nxv2i8:
  case MVT::nxv2i16:
  case MVT::nxv2i32:
  case MVT::nxv2i64:
  case MVT::nxv2f32:
  case MVT::nxv2f64:
    return MVT::nxv2i64;
  case MVT::nxv4i8:
  case MVT::nxv4i16:
  case MVT::nxv4i32:
  case MVT::nxv4f32:
    return MVT::nxv4i32;
  case MVT::nxv8i8:
  case MVT::nxv8i16:
  case MVT::nxv8f16:
  case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
  }
}

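// Lower an SVE ld1 intrinsic to the given predicated-load node, loading in the
// wider SVE container type and truncating the result back to VT when needed.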
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
    return SDValue();

  EVT ContainerVT = VT;
  if (ContainerVT.isInteger())
    ContainerVT = getSVEContainerType(ContainerVT);

  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
  SDValue Ops[] = { N->getOperand(0), // Chain
                    N->getOperand(2), // Pg
                    N->getOperand(3), // Base
                    DAG.getValueType(VT) };

  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (ContainerVT.isInteger() && (VT != ContainerVT))
    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({ Load, LoadChain }, DL);
}

static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT PtrTy = N->getOperand(3).getValueType();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
  SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
                                MINode->getOperand(3), DAG.getUNDEF(PtrTy),
                                MINode->getOperand(2), PassThru,
                                MINode->getMemoryVT(), MINode->getMemOperand(),
                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);

  if (VT.isFloatingPoint()) {
    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }

  return L;
}

template <unsigned Opcode>
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
                "Unsupported opcode.");
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (VT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

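// Lower an SVE st1 intrinsic to ST1_PRED, widening the data to its SVE
// container type first (bitcast for floating point, any-extend for integers).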
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT HwSrcVt = getSVEContainerType(DataVT);
  SDValue InputVT = DAG.getValueType(DataVT);

  if (DataVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDValue SrcNew;
  if (Data.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);

  SDValue Ops[] = { N->getOperand(0), // Chain
                    SrcNew,
                    N->getOperand(4), // Base
                    N->getOperand(3), // Pg
                    InputVT
                  };

  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}

static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT PtrTy = N->getOperand(4).getValueType();

  if (DataVT.isFloatingPoint())
    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
                            MINode->getMemoryVT(), MINode->getMemOperand(),
                            ISD::UNINDEXED, false, false);
}

/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Avoid scalarizing zero splat stores for scalable vectors.
  if (VT.isScalableVector())
    return SDValue();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
  // 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
         VT.getVectorElementType().getSizeInBits() == 64) ||
        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
         VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Don't replace floating point stores, they possibly won't be transformed to
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // Check that this is a splat.
  // Make sure that each of the relevant vector element locations are inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    IndexNotInserted.reset(IndexVal);

    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

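// Store combine: scalarize zero and scalar splat stores where profitable, and
// split slow unaligned 128-bit vector stores into two 64-bit halves.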
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {
  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isFixedLengthVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
      S->getAlign() <= Align(2))
    return SDValue();

  // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);

  // Split VT into two.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NumElts = HalfVT.getVectorNumElements();
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlign(), S->getMemOperand()->getFlags());
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlign(),
                      S->getMemOperand()->getFlags());
}

static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");

  // splice(pg, op1, undef) -> op1
  if (N->getOperand(2).isUndef())
    return N->getOperand(1);

  return SDValue();
}

static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
          N->getOpcode() == AArch64ISD::UUNPKLO) &&
         "Unexpected Opcode!");

  // uunpklo/hi undef -> undef
  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(N->getValueType(0));

  // If this is a masked load followed by an UUNPKLO, fold this into a masked
  // extending load. We can do this even if this is already a masked
  // {z,}extload.
  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
      N->getOpcode() == AArch64ISD::UUNPKLO) {
    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
    SDValue Mask = MLD->getMask();
    SDLoc DL(N);

    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
        (MLD->getPassThru()->isUndef() ||
         isZerosVector(MLD->getPassThru().getNode()))) {
      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
      unsigned PgPattern = Mask->getConstantOperandVal(0);
      EVT VT = N->getValueType(0);

      // Ensure we can double the size of the predicate pattern
      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
      if (NumElts &&
          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
        Mask =
            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
        SDValue PassThru = DAG.getConstant(0, DL, VT);
        SDValue NewLoad = DAG.getMaskedLoad(
            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
            MLD->getAddressingMode(), ISD::ZEXTLOAD);

        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));

        return NewLoad;
      }
    }
  }

  return SDValue();
}

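// DAG combines for AArch64ISD::UZP1: fold away undef operands, look through
// matching unpack/uzp pairs, and merge truncates into a wider UZP1 on
// little-endian targets.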
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT ResVT = N->getValueType(0);

  // uzp1(x, undef) -> concat(truncate(x), undef)
  if (Op1.getOpcode() == ISD::UNDEF) {
    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
    switch (ResVT.getSimpleVT().SimpleTy) {
    default:
      break;
    case MVT::v16i8:
      BCVT = MVT::v8i16;
      HalfVT = MVT::v8i8;
      break;
    case MVT::v8i16:
      BCVT = MVT::v4i32;
      HalfVT = MVT::v4i16;
      break;
    case MVT::v4i32:
      BCVT = MVT::v2i64;
      HalfVT = MVT::v2i32;
      break;
    }
    if (BCVT != MVT::Other) {
      SDValue BC = DAG.getBitcast(BCVT, Op0);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
                         DAG.getUNDEF(HalfVT));
    }
  }

  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue X = Op0.getOperand(0).getOperand(0);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
    }
  }

  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue Z = Op1.getOperand(0).getOperand(1);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
    }
  }

  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
  // This optimization is only implemented for little-endian subtargets.
  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
  if (!IsLittleEndian)
    return SDValue();

  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
    return SDValue();

  auto getSourceOp = [](SDValue Operand) -> SDValue {
    const unsigned Opcode = Operand.getOpcode();
    if (Opcode == ISD::TRUNCATE)
      return Operand->getOperand(0);
    if (Opcode == ISD::BITCAST &&
        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
      return Operand->getOperand(0)->getOperand(0);
    return SDValue();
  };

  SDValue SourceOp0 = getSourceOp(Op0);
  SDValue SourceOp1 = getSourceOp(Op1);

  if (!SourceOp0 || !SourceOp1)
    return SDValue();

  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
      !SourceOp0.getValueType().isSimple())
    return SDValue();

  EVT ResultTy;
  switch (SourceOp0.getSimpleValueType().SimpleTy) {
  case MVT::v2i64:
    ResultTy = MVT::v4i32;
    break;
  case MVT::v4i32:
    ResultTy = MVT::v8i16;
    break;
  case MVT::v8i16:
    ResultTy = MVT::v16i8;
    break;
  default:
    return SDValue();
  }

  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
  SDValue UzpResult =
      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);

  EVT BitcastResultTy;
  switch (ResVT.getSimpleVT().SimpleTy) {
  case MVT::v2i32:
    BitcastResultTy = MVT::v2i64;
    break;
  case MVT::v4i16:
    BitcastResultTy = MVT::v4i32;
    break;
  case MVT::v8i8:
    BitcastResultTy = MVT::v8i16;
    break;
  default:
    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
  }

  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
}

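// For GLD1-style gather loads, fold a predicated sign/zero extension of the
// vector offset into the gather itself by switching to the corresponding
// SXTW/UXTW gather opcode.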
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();
  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
         "Invalid opcode.");

  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;

  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Pg = N->getOperand(1);
  SDValue Base = N->getOperand(2);
  SDValue Offset = N->getOperand(3);
  SDValue Ty = N->getOperand(4);

  EVT ResVT = N->getValueType(0);

  const auto OffsetOpc = Offset.getOpcode();
  const bool OffsetIsZExt =
      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
  const bool OffsetIsSExt =
      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;

  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
    SDValue ExtPg = Offset.getOperand(0);
    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();

    // If the predicate for the sign- or zero-extended offset is the
    // same as the predicate used for this load and the sign-/zero-extension
    // was from 32 bits...
    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
      SDValue UnextendedOffset = Offset.getOperand(1);

      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
      if (Signed)
        NewOpc = getSignExtendedGatherOpcode(NewOpc);

      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
                         {Chain, Pg, Base, UnextendedOffset, Ty});
    }
  }

  return SDValue();
}

/// Optimize a vector shift instruction and its operand if shifted out
/// bits are not used.
static SDValue performVectorShiftCombine(SDNode *N,
                                         const AArch64TargetLowering &TLI,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  assert(N->getOpcode() == AArch64ISD::VASHR ||
         N->getOpcode() == AArch64ISD::VLSHR);

  SDValue Op = N->getOperand(0);
  unsigned OpScalarSize = Op.getScalarValueSizeInBits();

  unsigned ShiftImm = N->getConstantOperandVal(1);
  assert(OpScalarSize > ShiftImm && "Invalid shift imm");

  APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
  APInt DemandedMask = ~ShiftedOutBits;

  if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
  // This transform works in partnership with performSetCCPunpkCombine to
  // remove unnecessary transfer of predicates into standard registers and back
  if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
      N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
          MVT::i1) {
    SDValue CC = N->getOperand(0)->getOperand(0);
    auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
    SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
                               DAG.getVectorIdxConstant(0, SDLoc(N)));
    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
  }

  return SDValue();
}

  16829. /// Target-specific DAG combine function for post-increment LD1 (lane) and
  16830. /// post-increment LD1R.
  16831. static SDValue performPostLD1Combine(SDNode *N,
  16832. TargetLowering::DAGCombinerInfo &DCI,
  16833. bool IsLaneOp) {
  16834. if (DCI.isBeforeLegalizeOps())
  16835. return SDValue();
  16836. SelectionDAG &DAG = DCI.DAG;
  16837. EVT VT = N->getValueType(0);
  16838. if (!VT.is128BitVector() && !VT.is64BitVector())
  16839. return SDValue();
  16840. unsigned LoadIdx = IsLaneOp ? 1 : 0;
  16841. SDNode *LD = N->getOperand(LoadIdx).getNode();
  16842. // If it is not LOAD, can not do such combine.
  16843. if (LD->getOpcode() != ISD::LOAD)
  16844. return SDValue();
  16845. // The vector lane must be a constant in the LD1LANE opcode.
  16846. SDValue Lane;
  16847. if (IsLaneOp) {
  16848. Lane = N->getOperand(2);
  16849. auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
  16850. if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
  16851. return SDValue();
  16852. }
  16853. LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
  16854. EVT MemVT = LoadSDN->getMemoryVT();
  16855. // Check if memory operand is the same type as the vector element.
  16856. if (MemVT != VT.getVectorElementType())
  16857. return SDValue();
  16858. // Check if there are other uses. If so, do not combine as it will introduce
  16859. // an extra load.
  16860. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
  16861. ++UI) {
  16862. if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
  16863. continue;
  16864. if (*UI != N)
  16865. return SDValue();
  16866. }
  16867. SDValue Addr = LD->getOperand(1);
  16868. SDValue Vector = N->getOperand(0);
  16869. // Search for a use of the address operand that is an increment.
  16870. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
  16871. Addr.getNode()->use_end(); UI != UE; ++UI) {
  16872. SDNode *User = *UI;
  16873. if (User->getOpcode() != ISD::ADD
  16874. || UI.getUse().getResNo() != Addr.getResNo())
  16875. continue;
  16876. // If the increment is a constant, it must match the memory ref size.
  16877. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
  16878. if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
  16879. uint32_t IncVal = CInc->getZExtValue();
  16880. unsigned NumBytes = VT.getScalarSizeInBits() / 8;
  16881. if (IncVal != NumBytes)
  16882. continue;
  16883. Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
  16884. }
// To avoid creating a cycle, make sure that neither the load nor the add
// is a predecessor of the other or of the Vector.
  16887. SmallPtrSet<const SDNode *, 32> Visited;
  16888. SmallVector<const SDNode *, 16> Worklist;
  16889. Visited.insert(Addr.getNode());
  16890. Worklist.push_back(User);
  16891. Worklist.push_back(LD);
  16892. Worklist.push_back(Vector.getNode());
  16893. if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
  16894. SDNode::hasPredecessorHelper(User, Visited, Worklist))
  16895. continue;
  16896. SmallVector<SDValue, 8> Ops;
  16897. Ops.push_back(LD->getOperand(0)); // Chain
  16898. if (IsLaneOp) {
  16899. Ops.push_back(Vector); // The vector to be inserted
  16900. Ops.push_back(Lane); // The lane to be inserted in the vector
  16901. }
  16902. Ops.push_back(Addr);
  16903. Ops.push_back(Inc);
  16904. EVT Tys[3] = { VT, MVT::i64, MVT::Other };
  16905. SDVTList SDTys = DAG.getVTList(Tys);
  16906. unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
  16907. SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
  16908. MemVT,
  16909. LoadSDN->getMemOperand());
  16910. // Update the uses.
  16911. SDValue NewResults[] = {
  16912. SDValue(LD, 0), // The result of load
  16913. SDValue(UpdN.getNode(), 2) // Chain
  16914. };
  16915. DCI.CombineTo(LD, NewResults);
  16916. DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
  16917. DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
  16918. break;
  16919. }
  16920. return SDValue();
  16921. }
  16922. /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
  16923. /// address translation.
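/// For example (shape assumed): if Addr is (and X, 0x00ffffffffffffff), the
/// AND only clears bits 56-63, which TBI-capable hardware ignores anyway, so
/// SimplifyDemandedBits can replace Addr with X.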
  16924. static bool performTBISimplification(SDValue Addr,
  16925. TargetLowering::DAGCombinerInfo &DCI,
  16926. SelectionDAG &DAG) {
  16927. APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  16928. KnownBits Known;
  16929. TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
  16930. !DCI.isBeforeLegalizeOps());
  16931. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  16932. if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
  16933. DCI.CommitTargetLoweringOpt(TLO);
  16934. return true;
  16935. }
  16936. return false;
  16937. }
  16938. static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
  16939. assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
  16940. "Expected STORE dag node in input!");
  16941. if (auto Store = dyn_cast<StoreSDNode>(N)) {
  16942. if (!Store->isTruncatingStore() || Store->isIndexed())
  16943. return SDValue();
  16944. SDValue Ext = Store->getValue();
  16945. auto ExtOpCode = Ext.getOpcode();
  16946. if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
  16947. ExtOpCode != ISD::ANY_EXTEND)
  16948. return SDValue();
  16949. SDValue Orig = Ext->getOperand(0);
  16950. if (Store->getMemoryVT() != Orig.getValueType())
  16951. return SDValue();
  16952. return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
  16953. Store->getBasePtr(), Store->getMemOperand());
  16954. }
  16955. return SDValue();
  16956. }
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256 bits whose type does not divide evenly
// into 256-bit pieces, so that 256-bit LDNP (Q-register pair) load
// instructions can be selected.
  16960. static SDValue performLOADCombine(SDNode *N,
  16961. TargetLowering::DAGCombinerInfo &DCI,
  16962. SelectionDAG &DAG,
  16963. const AArch64Subtarget *Subtarget) {
  16964. if (Subtarget->supportsAddressTopByteIgnored())
  16965. performTBISimplification(N->getOperand(1), DCI, DAG);
  16966. LoadSDNode *LD = cast<LoadSDNode>(N);
  16967. EVT MemVT = LD->getMemoryVT();
  16968. if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
  16969. return SDValue(N, 0);
  16970. if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
  16971. MemVT.getSizeInBits() % 256 == 0 ||
  16972. 256 % MemVT.getScalarSizeInBits() != 0)
  16973. return SDValue(N, 0);
  16974. SDLoc DL(LD);
  16975. SDValue Chain = LD->getChain();
  16976. SDValue BasePtr = LD->getBasePtr();
  16977. SDNodeFlags Flags = LD->getFlags();
  16978. SmallVector<SDValue, 4> LoadOps;
  16979. SmallVector<SDValue, 4> LoadOpsChain;
// Replace any nontemporal load wider than 256 bits with a series of 256-bit
// loads and a final load of the remaining bits (fewer than 256). This way we
// can utilize 256-bit loads and reduce the number of load instructions
// generated.
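// A worked example of the shape this handles (sizes assumed): a nontemporal
// v20i32 load (640 bits) becomes two v8i32 (256-bit) loads at byte offsets 0
// and 32 plus a v4i32 load of the remaining 128 bits at byte offset 64; the
// pieces are concatenated and the original v20i32 value is extracted back out.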
  16983. MVT NewVT =
  16984. MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
  16985. 256 / MemVT.getVectorElementType().getSizeInBits());
  16986. unsigned Num256Loads = MemVT.getSizeInBits() / 256;
// Create the 256-bit loads at byte offsets 0, 32, ..., (Num256Loads - 1) * 32.
  16988. for (unsigned I = 0; I < Num256Loads; I++) {
  16989. unsigned PtrOffset = I * 32;
  16990. SDValue NewPtr = DAG.getMemBasePlusOffset(
  16991. BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
  16992. Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
  16993. SDValue NewLoad = DAG.getLoad(
  16994. NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
  16995. NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
  16996. LoadOps.push_back(NewLoad);
  16997. LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
  16998. }
// Process the remaining bits of the load operation.
// This is done by creating an UNDEF vector that matches the size of the
// 256-bit loads and inserting the remaining load into it. We extract the
// original load type at the end using an EXTRACT_SUBVECTOR node.
  17003. unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
  17004. unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
  17005. MVT RemainingVT = MVT::getVectorVT(
  17006. MemVT.getVectorElementType().getSimpleVT(),
  17007. BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
  17008. SDValue NewPtr =
  17009. DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
  17010. Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
  17011. SDValue RemainingLoad =
  17012. DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
  17013. LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
  17014. LD->getMemOperand()->getFlags(), LD->getAAInfo());
  17015. SDValue UndefVector = DAG.getUNDEF(NewVT);
  17016. SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
SDValue ExtendedRemainingLoad =
    DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                {UndefVector, RemainingLoad, InsertIdx});
LoadOps.push_back(ExtendedRemainingLoad);
  17021. LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
  17022. EVT ConcatVT =
  17023. EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
  17024. LoadOps.size() * NewVT.getVectorNumElements());
  17025. SDValue ConcatVectors =
  17026. DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
  17027. // Extract the original vector type size.
  17028. SDValue ExtractSubVector =
  17029. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
  17030. {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
  17031. SDValue TokenFactor =
  17032. DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
  17033. return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
  17034. }
  17035. static SDValue performSTORECombine(SDNode *N,
  17036. TargetLowering::DAGCombinerInfo &DCI,
  17037. SelectionDAG &DAG,
  17038. const AArch64Subtarget *Subtarget) {
  17039. StoreSDNode *ST = cast<StoreSDNode>(N);
  17040. SDValue Chain = ST->getChain();
  17041. SDValue Value = ST->getValue();
  17042. SDValue Ptr = ST->getBasePtr();
  17043. EVT ValueVT = Value.getValueType();
  17044. auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
  17045. EVT EltVT = VT.getVectorElementType();
  17046. return EltVT == MVT::f32 || EltVT == MVT::f64;
  17047. };
  17048. // If this is an FP_ROUND followed by a store, fold this into a truncating
  17049. // store. We can do this even if this is already a truncstore.
  17050. // We purposefully don't care about legality of the nodes here as we know
  17051. // they can be split down into something legal.
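// One assumed example: with 256-bit SVE fixed-length vectors, a store of
// (fp_round v8f64 X to v8f32) can be emitted as a single truncating store of X
// with memory type v8f32, rather than rounding first and storing the result.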
  17052. if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
  17053. Value.getNode()->hasOneUse() && ST->isUnindexed() &&
  17054. Subtarget->useSVEForFixedLengthVectors() &&
  17055. ValueVT.isFixedLengthVector() &&
  17056. ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
  17057. hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
  17058. return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
  17059. ST->getMemoryVT(), ST->getMemOperand());
  17060. if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
  17061. return Split;
  17062. if (Subtarget->supportsAddressTopByteIgnored() &&
  17063. performTBISimplification(N->getOperand(2), DCI, DAG))
  17064. return SDValue(N, 0);
  17065. if (SDValue Store = foldTruncStoreOfExt(DAG, N))
  17066. return Store;
  17067. return SDValue();
  17068. }
  17069. static SDValue performMSTORECombine(SDNode *N,
  17070. TargetLowering::DAGCombinerInfo &DCI,
  17071. SelectionDAG &DAG,
  17072. const AArch64Subtarget *Subtarget) {
  17073. MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  17074. SDValue Value = MST->getValue();
  17075. SDValue Mask = MST->getMask();
  17076. SDLoc DL(N);
  17077. // If this is a UZP1 followed by a masked store, fold this into a masked
  17078. // truncating store. We can do this even if this is already a masked
  17079. // truncstore.
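// Assumed shape of one case this catches: a masked store of
//   nxv16i8 (uzp1 (bitcast (A : nxv8i16)), B)
// under a ptrue(vlN) mask can instead store A directly as a masked truncating
// store with an nxv8i1 ptrue of the same pattern, provided the vlN pattern at
// the wider element size still fits within the minimum SVE vector length.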
  17080. if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
  17081. MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
  17082. Value.getValueType().isInteger()) {
  17083. Value = Value.getOperand(0);
  17084. if (Value.getOpcode() == ISD::BITCAST) {
  17085. EVT HalfVT =
  17086. Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
  17087. EVT InVT = Value.getOperand(0).getValueType();
  17088. if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
  17089. unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  17090. unsigned PgPattern = Mask->getConstantOperandVal(0);
  17091. // Ensure we can double the size of the predicate pattern
  17092. unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
  17093. if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
  17094. MinSVESize) {
  17095. Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
  17096. PgPattern);
  17097. return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
  17098. MST->getBasePtr(), MST->getOffset(), Mask,
  17099. MST->getMemoryVT(), MST->getMemOperand(),
  17100. MST->getAddressingMode(),
  17101. /*IsTruncating=*/true);
  17102. }
  17103. }
  17104. }
  17105. }
  17106. return SDValue();
  17107. }
  17108. /// \return true if part of the index was folded into the Base.
  17109. static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
  17110. SDLoc DL, SelectionDAG &DAG) {
  17111. // This function assumes a vector of i64 indices.
  17112. EVT IndexVT = Index.getValueType();
  17113. if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
  17114. return false;
// Simplify:
//   BasePtr = Ptr
//   Index = X + splat(Offset)
// ->
//   BasePtr = Ptr + Offset * Scale
//   Index = X
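// E.g. (constants assumed): with Scale == 4 and Index == (add X, splat(8)),
// BasePtr becomes BasePtr + 32 and Index simplifies to X.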
  17121. if (Index.getOpcode() == ISD::ADD) {
  17122. if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
  17123. Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
  17124. BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
  17125. Index = Index.getOperand(0);
  17126. return true;
  17127. }
  17128. }
// Simplify:
//   BasePtr = Ptr
//   Index = (X + splat(Offset)) << splat(Shift)
// ->
//   BasePtr = Ptr + (Offset << Shift) * Scale
//   Index = X << splat(Shift)
  17135. if (Index.getOpcode() == ISD::SHL &&
  17136. Index.getOperand(0).getOpcode() == ISD::ADD) {
  17137. SDValue Add = Index.getOperand(0);
  17138. SDValue ShiftOp = Index.getOperand(1);
  17139. SDValue OffsetOp = Add.getOperand(1);
  17140. if (auto Shift = DAG.getSplatValue(ShiftOp))
  17141. if (auto Offset = DAG.getSplatValue(OffsetOp)) {
  17142. Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
  17143. Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
  17144. BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
  17145. Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
  17146. Add.getOperand(0), ShiftOp);
  17147. return true;
  17148. }
  17149. }
  17150. return false;
  17151. }
  17152. // Analyse the specified address returning true if a more optimal addressing
  17153. // mode is available. When returning true all parameters are updated to reflect
  17154. // their recommended values.
  17155. static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
  17156. SDValue &BasePtr, SDValue &Index,
  17157. SelectionDAG &DAG) {
  17158. // Try to iteratively fold parts of the index into the base pointer to
  17159. // simplify the index as much as possible.
  17160. bool Changed = false;
  17161. while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
  17162. Changed = true;
  17163. // Only consider element types that are pointer sized as smaller types can
  17164. // be easily promoted.
  17165. EVT IndexVT = Index.getValueType();
  17166. if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
  17167. return Changed;
  17168. // Can indices be trivially shrunk?
  17169. EVT DataVT = N->getOperand(1).getValueType();
// Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
// will later be re-extended to 64 bits during legalization.
  17172. if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
  17173. return Changed;
  17174. if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
  17175. EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
  17176. Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
  17177. return true;
  17178. }
  17179. // Match:
  17180. // Index = step(const)
  17181. int64_t Stride = 0;
  17182. if (Index.getOpcode() == ISD::STEP_VECTOR) {
  17183. Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
  17184. }
  17185. // Match:
  17186. // Index = step(const) << shift(const)
  17187. else if (Index.getOpcode() == ISD::SHL &&
  17188. Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
  17189. SDValue RHS = Index.getOperand(1);
  17190. if (auto *Shift =
  17191. dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
  17192. int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
  17193. Stride = Step << Shift->getZExtValue();
  17194. }
  17195. }
// Return early if no supported pattern is found.
  17197. if (Stride == 0)
  17198. return Changed;
  17199. if (Stride < std::numeric_limits<int32_t>::min() ||
  17200. Stride > std::numeric_limits<int32_t>::max())
  17201. return Changed;
  17202. const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  17203. unsigned MaxVScale =
  17204. Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
  17205. int64_t LastElementOffset =
  17206. IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
  17207. if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
  17208. LastElementOffset > std::numeric_limits<int32_t>::max())
  17209. return Changed;
  17210. EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
// The stride is not scaled by 'Scale' here because that scaling is applied
// implicitly by the gather/scatter addressing mode.
  17213. Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
  17214. return true;
  17215. }
  17216. static SDValue performMaskedGatherScatterCombine(
  17217. SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
  17218. MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
  17219. assert(MGS && "Can only combine gather load or scatter store nodes");
  17220. if (!DCI.isBeforeLegalize())
  17221. return SDValue();
  17222. SDLoc DL(MGS);
  17223. SDValue Chain = MGS->getChain();
  17224. SDValue Scale = MGS->getScale();
  17225. SDValue Index = MGS->getIndex();
  17226. SDValue Mask = MGS->getMask();
  17227. SDValue BasePtr = MGS->getBasePtr();
  17228. ISD::MemIndexType IndexType = MGS->getIndexType();
  17229. if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
  17230. return SDValue();
// We found a more optimal BasePtr/Index pair above, so rebuild the
// gather/scatter with them; the resulting Index is friendlier to later
// legalisation.
  17233. if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
  17234. SDValue PassThru = MGT->getPassThru();
  17235. SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
  17236. return DAG.getMaskedGather(
  17237. DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
  17238. Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
  17239. }
  17240. auto *MSC = cast<MaskedScatterSDNode>(MGS);
  17241. SDValue Data = MSC->getValue();
  17242. SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
  17243. return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
  17244. Ops, MSC->getMemOperand(), IndexType,
  17245. MSC->isTruncatingStore());
  17246. }
  17247. /// Target-specific DAG combine function for NEON load/store intrinsics
  17248. /// to merge base address updates.
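/// As a rough example (registers assumed): an @llvm.aarch64.neon.ld2 of two
/// 4 x i32 vectors at [x0] followed by add x0, x0, #32 can merge into the
/// post-indexed form
///   ld2 { v0.4s, v1.4s }, [x0], #32
/// and similarly for the other ldN/stN/ldNr/ldNlane variants handled below.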
  17249. static SDValue performNEONPostLDSTCombine(SDNode *N,
  17250. TargetLowering::DAGCombinerInfo &DCI,
  17251. SelectionDAG &DAG) {
  17252. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  17253. return SDValue();
  17254. unsigned AddrOpIdx = N->getNumOperands() - 1;
  17255. SDValue Addr = N->getOperand(AddrOpIdx);
  17256. // Search for a use of the address operand that is an increment.
  17257. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
  17258. UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
  17259. SDNode *User = *UI;
  17260. if (User->getOpcode() != ISD::ADD ||
  17261. UI.getUse().getResNo() != Addr.getResNo())
  17262. continue;
  17263. // Check that the add is independent of the load/store. Otherwise, folding
  17264. // it would create a cycle.
  17265. SmallPtrSet<const SDNode *, 32> Visited;
  17266. SmallVector<const SDNode *, 16> Worklist;
  17267. Visited.insert(Addr.getNode());
  17268. Worklist.push_back(N);
  17269. Worklist.push_back(User);
  17270. if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
  17271. SDNode::hasPredecessorHelper(User, Visited, Worklist))
  17272. continue;
  17273. // Find the new opcode for the updating load/store.
  17274. bool IsStore = false;
  17275. bool IsLaneOp = false;
  17276. bool IsDupOp = false;
  17277. unsigned NewOpc = 0;
  17278. unsigned NumVecs = 0;
  17279. unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  17280. switch (IntNo) {
  17281. default: llvm_unreachable("unexpected intrinsic for Neon base update");
  17282. case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
  17283. NumVecs = 2; break;
  17284. case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
  17285. NumVecs = 3; break;
  17286. case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
  17287. NumVecs = 4; break;
  17288. case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
  17289. NumVecs = 2; IsStore = true; break;
  17290. case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
  17291. NumVecs = 3; IsStore = true; break;
  17292. case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
  17293. NumVecs = 4; IsStore = true; break;
  17294. case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
  17295. NumVecs = 2; break;
  17296. case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
  17297. NumVecs = 3; break;
  17298. case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
  17299. NumVecs = 4; break;
  17300. case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
  17301. NumVecs = 2; IsStore = true; break;
  17302. case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
  17303. NumVecs = 3; IsStore = true; break;
  17304. case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
  17305. NumVecs = 4; IsStore = true; break;
  17306. case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
  17307. NumVecs = 2; IsDupOp = true; break;
  17308. case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
  17309. NumVecs = 3; IsDupOp = true; break;
  17310. case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
  17311. NumVecs = 4; IsDupOp = true; break;
  17312. case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
  17313. NumVecs = 2; IsLaneOp = true; break;
  17314. case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
  17315. NumVecs = 3; IsLaneOp = true; break;
  17316. case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
  17317. NumVecs = 4; IsLaneOp = true; break;
  17318. case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
  17319. NumVecs = 2; IsStore = true; IsLaneOp = true; break;
  17320. case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
  17321. NumVecs = 3; IsStore = true; IsLaneOp = true; break;
  17322. case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
  17323. NumVecs = 4; IsStore = true; IsLaneOp = true; break;
  17324. }
  17325. EVT VecTy;
  17326. if (IsStore)
  17327. VecTy = N->getOperand(2).getValueType();
  17328. else
  17329. VecTy = N->getValueType(0);
  17330. // If the increment is a constant, it must match the memory ref size.
  17331. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
  17332. if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
  17333. uint32_t IncVal = CInc->getZExtValue();
  17334. unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  17335. if (IsLaneOp || IsDupOp)
  17336. NumBytes /= VecTy.getVectorNumElements();
  17337. if (IncVal != NumBytes)
  17338. continue;
  17339. Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
  17340. }
  17341. SmallVector<SDValue, 8> Ops;
  17342. Ops.push_back(N->getOperand(0)); // Incoming chain
  17343. // Load lane and store have vector list as input.
  17344. if (IsLaneOp || IsStore)
  17345. for (unsigned i = 2; i < AddrOpIdx; ++i)
  17346. Ops.push_back(N->getOperand(i));
  17347. Ops.push_back(Addr); // Base register
  17348. Ops.push_back(Inc);
  17349. // Return Types.
  17350. EVT Tys[6];
  17351. unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
  17352. unsigned n;
  17353. for (n = 0; n < NumResultVecs; ++n)
  17354. Tys[n] = VecTy;
  17355. Tys[n++] = MVT::i64; // Type of write back register
  17356. Tys[n] = MVT::Other; // Type of the chain
  17357. SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
  17358. MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
  17359. SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
  17360. MemInt->getMemoryVT(),
  17361. MemInt->getMemOperand());
  17362. // Update the uses.
  17363. std::vector<SDValue> NewResults;
  17364. for (unsigned i = 0; i < NumResultVecs; ++i) {
  17365. NewResults.push_back(SDValue(UpdN.getNode(), i));
  17366. }
  17367. NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
  17368. DCI.CombineTo(N, NewResults);
  17369. DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
  17370. break;
  17371. }
  17372. return SDValue();
  17373. }
  17374. // Checks to see if the value is the prescribed width and returns information
  17375. // about its extension mode.
  17376. static
  17377. bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
  17378. ExtType = ISD::NON_EXTLOAD;
  17379. switch(V.getNode()->getOpcode()) {
  17380. default:
  17381. return false;
  17382. case ISD::LOAD: {
  17383. LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
  17384. if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
  17385. || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
  17386. ExtType = LoadNode->getExtensionType();
  17387. return true;
  17388. }
  17389. return false;
  17390. }
  17391. case ISD::AssertSext: {
  17392. VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
  17393. if ((TypeNode->getVT() == MVT::i8 && width == 8)
  17394. || (TypeNode->getVT() == MVT::i16 && width == 16)) {
  17395. ExtType = ISD::SEXTLOAD;
  17396. return true;
  17397. }
  17398. return false;
  17399. }
  17400. case ISD::AssertZext: {
  17401. VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
  17402. if ((TypeNode->getVT() == MVT::i8 && width == 8)
  17403. || (TypeNode->getVT() == MVT::i16 && width == 16)) {
  17404. ExtType = ISD::ZEXTLOAD;
  17405. return true;
  17406. }
  17407. return false;
  17408. }
  17409. case ISD::Constant:
  17410. case ISD::TargetConstant: {
  17411. return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
  17412. 1LL << (width - 1);
  17413. }
  17414. }
  17415. return true;
  17416. }
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
//  +-------------+  +-------------+  +-------------+  +-------------+
//  |    Input    |  | AddConstant |  | CompConstant|  |      CC     |
//  +-------------+  +-------------+  +-------------+  +-------------+
//         |                |                |                |
//         V                V                |                |
//  +-------------+     +--------+           |                |
//  |     ADD     |     |  0xff  |           |                |
//  +-------------+     +--------+           |                |
//         |                |                |                |
//         V                V                |                |
//       +----------------------+            |                |
//       |         AND          |            |                |
//       +----------------------+            |                |
//                  |                        |                |
//                  +-----------+            |                |
//                              |            |                |
//                              V            V                V
//                            +----------------------------------+
//                            |               CMP                |
//                            +----------------------------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the
// nominal width of the input (this can work for any width of input; the graph
// above is specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4-bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8 to 15), 8 patterns unique to sign extension (-8 to -1), and 8
// patterns present in both extensions (0 to 7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns of the current extension type of Input
// (w0).
//
//   sub w8, w0, w1
//   and w10, w8, #0x0f
//   cmp w8, w2
//   cset w9, AArch64CC
//   cmp w10, w2
//   cset w11, AArch64CC
//   cmp w9, w11
//   cset w0, eq
//   ret
//
// Since the function above shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compilation. The equations below were
// written in a test harness that confirmed they give outputs equivalent to the
// function above for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the test for whether the AND can be removed,
// factored out of the DAG recognition because the DAG can take several forms.
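//
// A small concrete case (chosen for illustration, not exhaustive): if Input is
// an 8-bit zero-extended load, AddConstant is 0 and the comparison is EQ/NE
// against a constant in [0, 255], then
//   and w8, w0, #0xff
//   cmp w8, #42
// tests exactly the same condition as
//   cmp w0, #42
// because w0 already has no bits set above bit 7, so the AND can be removed.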
  17477. static bool isEquivalentMaskless(unsigned CC, unsigned width,
  17478. ISD::LoadExtType ExtType, int AddConstant,
  17479. int CompConstant) {
// By being careful about our equations and only writing them in terms of
// symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
  17483. int MaxUInt = (1 << width);
  17484. // For the purposes of these comparisons sign extending the type is
  17485. // equivalent to zero extending the add and displacing it by half the integer
  17486. // width. Provided we are careful and make sure our equations are valid over
  17487. // the whole range we can just adjust the input and avoid writing equations
  17488. // for sign extended inputs.
  17489. if (ExtType == ISD::SEXTLOAD)
  17490. AddConstant -= (1 << (width-1));
  17491. switch(CC) {
  17492. case AArch64CC::LE:
  17493. case AArch64CC::GT:
  17494. if ((AddConstant == 0) ||
  17495. (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
  17496. (AddConstant >= 0 && CompConstant < 0) ||
  17497. (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
  17498. return true;
  17499. break;
  17500. case AArch64CC::LT:
  17501. case AArch64CC::GE:
  17502. if ((AddConstant == 0) ||
  17503. (AddConstant >= 0 && CompConstant <= 0) ||
  17504. (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
  17505. return true;
  17506. break;
  17507. case AArch64CC::HI:
  17508. case AArch64CC::LS:
  17509. if ((AddConstant >= 0 && CompConstant < 0) ||
  17510. (AddConstant <= 0 && CompConstant >= -1 &&
  17511. CompConstant < AddConstant + MaxUInt))
  17512. return true;
  17513. break;
  17514. case AArch64CC::PL:
  17515. case AArch64CC::MI:
  17516. if ((AddConstant == 0) ||
  17517. (AddConstant > 0 && CompConstant <= 0) ||
  17518. (AddConstant < 0 && CompConstant <= AddConstant))
  17519. return true;
  17520. break;
  17521. case AArch64CC::LO:
  17522. case AArch64CC::HS:
  17523. if ((AddConstant >= 0 && CompConstant <= 0) ||
  17524. (AddConstant <= 0 && CompConstant >= 0 &&
  17525. CompConstant <= AddConstant + MaxUInt))
  17526. return true;
  17527. break;
  17528. case AArch64CC::EQ:
  17529. case AArch64CC::NE:
  17530. if ((AddConstant > 0 && CompConstant < 0) ||
  17531. (AddConstant < 0 && CompConstant >= 0 &&
  17532. CompConstant < AddConstant + MaxUInt) ||
  17533. (AddConstant >= 0 && CompConstant >= 0 &&
  17534. CompConstant >= AddConstant) ||
  17535. (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
  17536. return true;
  17537. break;
  17538. case AArch64CC::VS:
  17539. case AArch64CC::VC:
  17540. case AArch64CC::AL:
  17541. case AArch64CC::NV:
  17542. return true;
  17543. case AArch64CC::Invalid:
  17544. break;
  17545. }
  17546. return false;
  17547. }
// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
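// Worked instances (constants assumed): (X & 0xff) >u 0x0f becomes
// ((X & 0xf0) != 0), and (X & 0xff) <u 8 becomes ((X & 0xf8) == 0), each of
// which maps onto a single ANDS plus an NE/EQ condition.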
  17550. static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
  17551. SDNode *AndNode, SelectionDAG &DAG,
  17552. unsigned CCIndex, unsigned CmpIndex,
  17553. unsigned CC) {
  17554. ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
  17555. if (!SubsC)
  17556. return SDValue();
  17557. APInt SubsAP = SubsC->getAPIntValue();
  17558. if (CC == AArch64CC::HI) {
  17559. if (!SubsAP.isMask())
  17560. return SDValue();
  17561. } else if (CC == AArch64CC::LO) {
  17562. if (!SubsAP.isPowerOf2())
  17563. return SDValue();
  17564. } else
  17565. return SDValue();
  17566. ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
  17567. if (!AndC)
  17568. return SDValue();
  17569. APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
  17570. SDLoc DL(N);
  17571. APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
  17572. SDValue ANDS = DAG.getNode(
  17573. AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
  17574. DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
  17575. SDValue AArch64_CC =
  17576. DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
  17577. N->getOperand(CCIndex)->getValueType(0));
// For now, only performCSELCombine and performBRCONDCombine call this
// function, and both pass CCIndex == 2 and CmpIndex == 3 with four operands,
// so just initialise the operands directly to keep the code simple. If a
// future caller uses a different CCIndex or CmpIndex, this will need to be
// rewritten with a loop.
// TODO: Do we need to assert that the number of operands is 4 here?
  17584. assert((CCIndex == 2 && CmpIndex == 3) &&
  17585. "Expected CCIndex to be 2 and CmpIndex to be 3.");
  17586. SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
  17587. ANDS.getValue(1)};
  17588. return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
  17589. }
  17590. static
  17591. SDValue performCONDCombine(SDNode *N,
  17592. TargetLowering::DAGCombinerInfo &DCI,
  17593. SelectionDAG &DAG, unsigned CCIndex,
  17594. unsigned CmpIndex) {
  17595. unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  17596. SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  17597. unsigned CondOpcode = SubsNode->getOpcode();
  17598. if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
  17599. return SDValue();
  17600. // There is a SUBS feeding this condition. Is it fed by a mask we can
  17601. // use?
  17602. SDNode *AndNode = SubsNode->getOperand(0).getNode();
  17603. unsigned MaskBits = 0;
  17604. if (AndNode->getOpcode() != ISD::AND)
  17605. return SDValue();
  17606. if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
  17607. CmpIndex, CC))
  17608. return Val;
  17609. if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
  17610. uint32_t CNV = CN->getZExtValue();
  17611. if (CNV == 255)
  17612. MaskBits = 8;
  17613. else if (CNV == 65535)
  17614. MaskBits = 16;
  17615. }
  17616. if (!MaskBits)
  17617. return SDValue();
  17618. SDValue AddValue = AndNode->getOperand(0);
  17619. if (AddValue.getOpcode() != ISD::ADD)
  17620. return SDValue();
  17621. // The basic dag structure is correct, grab the inputs and validate them.
  17622. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  17623. SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  17624. SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and the provenance of all the values is a smaller type;
// let's see if the mask is superfluous.
  17627. if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
  17628. !isa<ConstantSDNode>(SubsInputValue.getNode()))
  17629. return SDValue();
  17630. ISD::LoadExtType ExtType;
  17631. if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
  17632. !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
  17633. !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
  17634. return SDValue();
  17635. if(!isEquivalentMaskless(CC, MaskBits, ExtType,
  17636. cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
  17637. cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
  17638. return SDValue();
  17639. // The AND is not necessary, remove it.
  17640. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
  17641. SubsNode->getValueType(1));
  17642. SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
  17643. SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  17644. DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
  17645. return SDValue(N, 0);
  17646. }
  17647. // Optimize compare with zero and branch.
  17648. static SDValue performBRCONDCombine(SDNode *N,
  17649. TargetLowering::DAGCombinerInfo &DCI,
  17650. SelectionDAG &DAG) {
  17651. MachineFunction &MF = DAG.getMachineFunction();
  17652. // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  17653. // will not be produced, as they are conditional branch instructions that do
  17654. // not set flags.
  17655. if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
  17656. return SDValue();
  17657. if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
  17658. N = NV.getNode();
  17659. SDValue Chain = N->getOperand(0);
  17660. SDValue Dest = N->getOperand(1);
  17661. SDValue CCVal = N->getOperand(2);
  17662. SDValue Cmp = N->getOperand(3);
  17663. assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  17664. unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  17665. if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
  17666. return SDValue();
  17667. unsigned CmpOpc = Cmp.getOpcode();
  17668. if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
  17669. return SDValue();
  17670. // Only attempt folding if there is only one use of the flag and no use of the
  17671. // value.
  17672. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
  17673. return SDValue();
  17674. SDValue LHS = Cmp.getOperand(0);
  17675. SDValue RHS = Cmp.getOperand(1);
  17676. assert(LHS.getValueType() == RHS.getValueType() &&
  17677. "Expected the value type to be the same for both operands!");
  17678. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
  17679. return SDValue();
  17680. if (isNullConstant(LHS))
  17681. std::swap(LHS, RHS);
  17682. if (!isNullConstant(RHS))
  17683. return SDValue();
  17684. if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
  17685. LHS.getOpcode() == ISD::SRL)
  17686. return SDValue();
  17687. // Fold the compare into the branch instruction.
  17688. SDValue BR;
  17689. if (CC == AArch64CC::EQ)
  17690. BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  17691. else
  17692. BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  17693. // Do not add new nodes to DAG combiner worklist.
  17694. DCI.CombineTo(N, BR, false);
  17695. return SDValue();
  17696. }
  17697. static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
  17698. unsigned CC = N->getConstantOperandVal(2);
  17699. SDValue SUBS = N->getOperand(3);
  17700. SDValue Zero, CTTZ;
  17701. if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
  17702. Zero = N->getOperand(0);
  17703. CTTZ = N->getOperand(1);
  17704. } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
  17705. Zero = N->getOperand(1);
  17706. CTTZ = N->getOperand(0);
  17707. } else
  17708. return SDValue();
  17709. if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
  17710. (CTTZ.getOpcode() == ISD::TRUNCATE &&
  17711. CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
  17712. return SDValue();
  17713. assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
  17714. "Illegal type in CTTZ folding");
  17715. if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
  17716. return SDValue();
  17717. SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
  17718. ? CTTZ.getOperand(0).getOperand(0)
  17719. : CTTZ.getOperand(0);
  17720. if (X != SUBS.getOperand(0))
  17721. return SDValue();
  17722. unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
  17723. ? CTTZ.getOperand(0).getValueSizeInBits()
  17724. : CTTZ.getValueSizeInBits();
  17725. SDValue BitWidthMinusOne =
  17726. DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
  17727. return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
  17728. BitWidthMinusOne);
  17729. }
  17730. // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
  17731. // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
  17732. // Where x and y are constants and x != y
  17733. // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
  17734. // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
  17735. // Where x and y are constants and x != y
  17736. static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
  17737. SDValue L = Op->getOperand(0);
  17738. SDValue R = Op->getOperand(1);
  17739. AArch64CC::CondCode OpCC =
  17740. static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
  17741. SDValue OpCmp = Op->getOperand(3);
  17742. if (!isCMP(OpCmp))
  17743. return SDValue();
  17744. SDValue CmpLHS = OpCmp.getOperand(0);
  17745. SDValue CmpRHS = OpCmp.getOperand(1);
  17746. if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
  17747. std::swap(CmpLHS, CmpRHS);
  17748. else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
  17749. return SDValue();
  17750. SDValue X = CmpLHS->getOperand(0);
  17751. SDValue Y = CmpLHS->getOperand(1);
  17752. if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
  17753. return SDValue();
  17754. }
// If one of the constants is an opaque constant, the X and Y SDNodes can be
// different even though the underlying values are the same, so check the
// APInts here to make sure the code is correct.
  17758. ConstantSDNode *CX = cast<ConstantSDNode>(X);
  17759. ConstantSDNode *CY = cast<ConstantSDNode>(Y);
  17760. if (CX->getAPIntValue() == CY->getAPIntValue())
  17761. return SDValue();
  17762. AArch64CC::CondCode CC =
  17763. static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
  17764. SDValue Cond = CmpLHS->getOperand(3);
  17765. if (CmpRHS == Y)
  17766. CC = AArch64CC::getInvertedCondCode(CC);
  17767. else if (CmpRHS != X)
  17768. return SDValue();
  17769. if (OpCC == AArch64CC::NE)
  17770. CC = AArch64CC::getInvertedCondCode(CC);
  17771. else if (OpCC != AArch64CC::EQ)
  17772. return SDValue();
  17773. SDLoc DL(Op);
  17774. EVT VT = Op->getValueType(0);
  17775. SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
  17776. return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
  17777. }
  17778. // Optimize CSEL instructions
  17779. static SDValue performCSELCombine(SDNode *N,
  17780. TargetLowering::DAGCombinerInfo &DCI,
  17781. SelectionDAG &DAG) {
  17782. // CSEL x, x, cc -> x
  17783. if (N->getOperand(0) == N->getOperand(1))
  17784. return N->getOperand(0);
  17785. if (SDValue R = foldCSELOfCSEL(N, DAG))
  17786. return R;
  17787. // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
  17788. // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
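// E.g. (i32 width assumed): csel(0, cttz(X), eq(X, 0)) becomes
// and(cttz(X), 31); when X == 0, cttz produces the bit width (32) and
// 32 & 31 == 0, which matches the value the csel would have chosen.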
  17789. if (SDValue Folded = foldCSELofCTTZ(N, DAG))
  17790. return Folded;
  17791. return performCONDCombine(N, DCI, DAG, 2, 3);
  17792. }
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
// SET_CC result when lowering the select.
  17796. static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
  17797. EVT Op0MVT = Op->getOperand(0).getValueType();
  17798. if (!Op0MVT.isVector() || Op->use_empty())
  17799. return SDValue();
  17800. // Make sure that all uses of Op are VSELECTs with result matching types where
  17801. // the result type has a larger element type than the SetCC operand.
  17802. SDNode *FirstUse = *Op->use_begin();
  17803. if (FirstUse->getOpcode() != ISD::VSELECT)
  17804. return SDValue();
  17805. EVT UseMVT = FirstUse->getValueType(0);
  17806. if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
  17807. return SDValue();
  17808. if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
  17809. return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
  17810. }))
  17811. return SDValue();
  17812. APInt V;
  17813. if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
  17814. return SDValue();
  17815. SDLoc DL(Op);
  17816. SDValue Op0ExtV;
  17817. SDValue Op1ExtV;
  17818. ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
  17819. // Check if the first operand of the SET_CC is already extended. If it is,
  17820. // split the SET_CC and re-use the extended version of the operand.
  17821. SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
  17822. Op->getOperand(0));
  17823. SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
  17824. Op->getOperand(0));
  17825. if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
  17826. Op0ExtV = SDValue(Op0SExt, 0);
  17827. Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
  17828. } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
  17829. Op0ExtV = SDValue(Op0ZExt, 0);
  17830. Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
  17831. } else
  17832. return SDValue();
  17833. return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
  17834. Op0ExtV, Op1ExtV, Op->getOperand(2));
  17835. }
  17836. static SDValue performSETCCCombine(SDNode *N,
  17837. TargetLowering::DAGCombinerInfo &DCI,
  17838. SelectionDAG &DAG) {
  17839. assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
  17840. SDValue LHS = N->getOperand(0);
  17841. SDValue RHS = N->getOperand(1);
  17842. ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  17843. SDLoc DL(N);
  17844. EVT VT = N->getValueType(0);
  17845. if (SDValue V = tryToWidenSetCCOperands(N, DAG))
  17846. return V;
  17847. // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
  17848. if (Cond == ISD::SETNE && isOneConstant(RHS) &&
  17849. LHS->getOpcode() == AArch64ISD::CSEL &&
  17850. isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
  17851. LHS->hasOneUse()) {
  17852. // Invert CSEL's condition.
  17853. auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
  17854. auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
  17855. auto NewCond = getInvertedCondCode(OldCond);
  17856. // csel 0, 1, !cond, X
  17857. SDValue CSEL =
  17858. DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
  17859. LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
  17860. LHS.getOperand(3));
  17861. return DAG.getZExtOrTrunc(CSEL, DL, VT);
  17862. }
  17863. // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
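// E.g. (shift amount assumed): setcc (srl X, 3), 0, ne on i64 becomes
// setcc (and X, 0xfffffffffffffff8), 0, ne, which can then be emitted as a
// single TST with a logical immediate.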
  17864. if (Cond == ISD::SETNE && isNullConstant(RHS) &&
  17865. LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
  17866. LHS->hasOneUse()) {
  17867. EVT TstVT = LHS->getValueType(0);
  17868. if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
// This pattern is optimised better by emitComparison.
  17870. uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
  17871. SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
  17872. DAG.getConstant(TstImm, DL, TstVT));
  17873. return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
  17874. }
  17875. }
  17876. // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
  17877. // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
  17878. if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
  17879. (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
  17880. LHS->getOpcode() == ISD::BITCAST) {
  17881. EVT ToVT = LHS->getValueType(0);
  17882. EVT FromVT = LHS->getOperand(0).getValueType();
  17883. if (FromVT.isFixedLengthVector() &&
  17884. FromVT.getVectorElementType() == MVT::i1) {
  17885. LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
  17886. LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
  17887. return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
  17888. }
  17889. }
  17890. // Try to perform the memcmp when the result is tested for [in]equality with 0
  17891. if (SDValue V = performOrXorChainCombine(N, DAG))
  17892. return V;
  17893. return SDValue();
  17894. }
  17895. // Replace a flag-setting operator (eg ANDS) with the generic version
  17896. // (eg AND) if the flag is unused.
  17897. static SDValue performFlagSettingCombine(SDNode *N,
  17898. TargetLowering::DAGCombinerInfo &DCI,
  17899. unsigned GenericOpcode) {
  17900. SDLoc DL(N);
  17901. SDValue LHS = N->getOperand(0);
  17902. SDValue RHS = N->getOperand(1);
  17903. EVT VT = N->getValueType(0);
  17904. // If the flag result isn't used, convert back to a generic opcode.
  17905. if (!N->hasAnyUseOfValue(1)) {
  17906. SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
  17907. return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
  17908. DL);
  17909. }
  17910. // Combine identical generic nodes into this node, re-using the result.
  17911. if (SDNode *Generic = DCI.DAG.getNodeIfExists(
  17912. GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
  17913. DCI.CombineTo(Generic, SDValue(N, 0));
  17914. return SDValue();
  17915. }
  17916. static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
  17917. // setcc_merge_zero pred
  17918. // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
  17919. // => extract_subvector (inner setcc_merge_zero)
  17920. SDValue Pred = N->getOperand(0);
  17921. SDValue LHS = N->getOperand(1);
  17922. SDValue RHS = N->getOperand(2);
  17923. ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
  17924. if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
  17925. LHS->getOpcode() != ISD::SIGN_EXTEND)
  17926. return SDValue();
  17927. SDValue Extract = LHS->getOperand(0);
  17928. if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
  17929. Extract->getValueType(0) != N->getValueType(0) ||
  17930. Extract->getConstantOperandVal(1) != 0)
  17931. return SDValue();
  17932. SDValue InnerSetCC = Extract->getOperand(0);
  17933. if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
  17934. return SDValue();
  17935. // By this point we've effectively got
  17936. // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
  17937. // lanes are already zero then the trunc(sext()) sequence is redundant and we
  17938. // can operate on A directly.
  17939. SDValue InnerPred = InnerSetCC.getOperand(0);
  17940. if (Pred.getOpcode() == AArch64ISD::PTRUE &&
  17941. InnerPred.getOpcode() == AArch64ISD::PTRUE &&
  17942. Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
  17943. Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
  17944. Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
  17945. return Extract;
  17946. return SDValue();
  17947. }
  17948. static SDValue
  17949. performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  17950. assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
  17951. "Unexpected opcode!");
  17952. SelectionDAG &DAG = DCI.DAG;
  17953. SDValue Pred = N->getOperand(0);
  17954. SDValue LHS = N->getOperand(1);
  17955. SDValue RHS = N->getOperand(2);
  17956. ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
  17957. if (SDValue V = performSetCCPunpkCombine(N, DAG))
  17958. return V;
  17959. if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
  17960. LHS->getOpcode() == ISD::SIGN_EXTEND &&
  17961. LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
  17962. // setcc_merge_zero(
  17963. // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
  17964. // => setcc_merge_zero(pred, ...)
  17965. if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
  17966. LHS->getOperand(0)->getOperand(0) == Pred)
  17967. return LHS->getOperand(0);
  17968. // setcc_merge_zero(
  17969. // all_active, extend(nxvNi1 ...), != splat(0))
  17970. // -> nxvNi1 ...
  17971. if (isAllActivePredicate(DAG, Pred))
  17972. return LHS->getOperand(0);
  17973. // setcc_merge_zero(
  17974. // pred, extend(nxvNi1 ...), != splat(0))
  17975. // -> nxvNi1 and(pred, ...)
  17976. if (DCI.isAfterLegalizeDAG())
  17977. // Do this after legalization to allow more folds on setcc_merge_zero
  17978. // to be recognized.
  17979. return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
  17980. LHS->getOperand(0), Pred);
  17981. }
  17982. return SDValue();
  17983. }
  17984. // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
  17985. // as well as whether the test should be inverted. This code is required to
  17986. // catch these cases (as opposed to standard dag combines) because
  17987. // AArch64ISD::TBZ is matched during legalization.
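// A couple of assumed examples of the rewrites below: (tbz (shl x, 2), #3)
// becomes (tbz x, #1), and (tbz (xor x, -1), #b) becomes (tbnz x, #b) by
// flipping Invert.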
  17988. static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
  17989. SelectionDAG &DAG) {
  17990. if (!Op->hasOneUse())
  17991. return Op;
  17992. // We don't handle undef/constant-fold cases below, as they should have
  17993. // already been taken care of (e.g. and of 0, test of undefined shifted bits,
  17994. // etc.)
  17995. // (tbz (trunc x), b) -> (tbz x, b)
  17996. // This case is just here to enable more of the below cases to be caught.
  17997. if (Op->getOpcode() == ISD::TRUNCATE &&
  17998. Bit < Op->getValueType(0).getSizeInBits()) {
  17999. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18000. }
  18001. // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
  18002. if (Op->getOpcode() == ISD::ANY_EXTEND &&
  18003. Bit < Op->getOperand(0).getValueSizeInBits()) {
  18004. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18005. }
  18006. if (Op->getNumOperands() != 2)
  18007. return Op;
  18008. auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  18009. if (!C)
  18010. return Op;
  18011. switch (Op->getOpcode()) {
  18012. default:
  18013. return Op;
  18014. // (tbz (and x, m), b) -> (tbz x, b)
  18015. case ISD::AND:
  18016. if ((C->getZExtValue() >> Bit) & 1)
  18017. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18018. return Op;
  18019. // (tbz (shl x, c), b) -> (tbz x, b-c)
  18020. case ISD::SHL:
  18021. if (C->getZExtValue() <= Bit &&
  18022. (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
  18023. Bit = Bit - C->getZExtValue();
  18024. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18025. }
  18026. return Op;
  18027. // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
  18028. case ISD::SRA:
  18029. Bit = Bit + C->getZExtValue();
  18030. if (Bit >= Op->getValueType(0).getSizeInBits())
  18031. Bit = Op->getValueType(0).getSizeInBits() - 1;
  18032. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18033. // (tbz (srl x, c), b) -> (tbz x, b+c)
  18034. case ISD::SRL:
  18035. if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
  18036. Bit = Bit + C->getZExtValue();
  18037. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18038. }
  18039. return Op;
  18040. // (tbz (xor x, -1), b) -> (tbnz x, b)
  18041. case ISD::XOR:
  18042. if ((C->getZExtValue() >> Bit) & 1)
  18043. Invert = !Invert;
  18044. return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  18045. }
  18046. }
  18047. // Optimize test single bit zero/non-zero and branch.
  18048. static SDValue performTBZCombine(SDNode *N,
  18049. TargetLowering::DAGCombinerInfo &DCI,
  18050. SelectionDAG &DAG) {
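// Operand layout of the TBZ/TBNZ node handled here: 0 = chain, 1 = value under
// test, 2 = bit number, 3 = branch destination.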
  18051. unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  18052. bool Invert = false;
  18053. SDValue TestSrc = N->getOperand(1);
  18054. SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
  18055. if (TestSrc == NewTestSrc)
  18056. return SDValue();
  18057. unsigned NewOpc = N->getOpcode();
  18058. if (Invert) {
  18059. if (NewOpc == AArch64ISD::TBZ)
  18060. NewOpc = AArch64ISD::TBNZ;
  18061. else {
  18062. assert(NewOpc == AArch64ISD::TBNZ);
  18063. NewOpc = AArch64ISD::TBZ;
  18064. }
  18065. }
  18066. SDLoc DL(N);
  18067. return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
  18068. DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
  18069. }
18070. // Swap vselect operands where doing so may allow a predicated operation to
18071. // achieve the `sel`.
  18072. //
  18073. // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
  18074. // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
  18075. static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
  18076. auto SelectA = N->getOperand(1);
  18077. auto SelectB = N->getOperand(2);
  18078. auto NTy = N->getValueType(0);
  18079. if (!NTy.isScalableVector())
  18080. return SDValue();
  18081. SDValue SetCC = N->getOperand(0);
  18082. if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
  18083. return SDValue();
  18084. switch (SelectB.getOpcode()) {
  18085. default:
  18086. return SDValue();
  18087. case ISD::FMUL:
  18088. case ISD::FSUB:
  18089. case ISD::FADD:
  18090. break;
  18091. }
  18092. if (SelectA != SelectB.getOperand(0))
  18093. return SDValue();
  18094. ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  18095. ISD::CondCode InverseCC =
  18096. ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
  18097. auto InverseSetCC =
  18098. DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
  18099. SetCC.getOperand(1), InverseCC);
  18100. return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
  18101. {InverseSetCC, SelectB, SelectA});
  18102. }
  18103. // vselect (v1i1 setcc) ->
  18104. // vselect (v1iXX setcc) (XX is the size of the compared operand type)
  18105. // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
  18106. // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
  18107. // such VSELECT.
  18108. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
  18109. if (auto SwapResult = trySwapVSelectOperands(N, DAG))
  18110. return SwapResult;
  18111. SDValue N0 = N->getOperand(0);
  18112. EVT CCVT = N0.getValueType();
  18113. if (isAllActivePredicate(DAG, N0))
  18114. return N->getOperand(1);
  18115. if (isAllInactivePredicate(N0))
  18116. return N->getOperand(2);
  18117. // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
18118. // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
  18119. // supported types.
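// For example, with v4i32:
//   (vselect (setgt x, splat(-1)), splat(1), splat(-1))
//     --> (or (sra x, splat(31)), splat(1))
// i.e. the sign bits of x OR'd with 1 give 1 for non-negative and -1 for
// negative lanes.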
  18120. SDValue SetCC = N->getOperand(0);
  18121. if (SetCC.getOpcode() == ISD::SETCC &&
  18122. SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
  18123. SDValue CmpLHS = SetCC.getOperand(0);
  18124. EVT VT = CmpLHS.getValueType();
  18125. SDNode *CmpRHS = SetCC.getOperand(1).getNode();
  18126. SDNode *SplatLHS = N->getOperand(1).getNode();
  18127. SDNode *SplatRHS = N->getOperand(2).getNode();
  18128. APInt SplatLHSVal;
  18129. if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
  18130. VT.isSimple() &&
  18131. is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
  18132. MVT::v2i32, MVT::v4i32, MVT::v2i64}),
  18133. VT.getSimpleVT().SimpleTy) &&
  18134. ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
  18135. SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
  18136. ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
  18137. unsigned NumElts = VT.getVectorNumElements();
  18138. SmallVector<SDValue, 8> Ops(
  18139. NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
  18140. VT.getScalarType()));
  18141. SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
  18142. auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
  18143. auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
  18144. return Or;
  18145. }
  18146. }
  18147. if (N0.getOpcode() != ISD::SETCC ||
  18148. CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
  18149. CCVT.getVectorElementType() != MVT::i1)
  18150. return SDValue();
  18151. EVT ResVT = N->getValueType(0);
  18152. EVT CmpVT = N0.getOperand(0).getValueType();
  18153. // Only combine when the result type is of the same size as the compared
  18154. // operands.
  18155. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
  18156. return SDValue();
  18157. SDValue IfTrue = N->getOperand(1);
  18158. SDValue IfFalse = N->getOperand(2);
  18159. SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
  18160. N0.getOperand(0), N0.getOperand(1),
  18161. cast<CondCodeSDNode>(N0.getOperand(2))->get());
  18162. return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
  18163. IfTrue, IfFalse);
  18164. }
  18165. /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
  18166. /// the compare-mask instructions rather than going via NZCV, even if LHS and
  18167. /// RHS are really scalar. This replaces any scalar setcc in the above pattern
  18168. /// with a vector one followed by a DUP shuffle on the result.
  18169. static SDValue performSelectCombine(SDNode *N,
  18170. TargetLowering::DAGCombinerInfo &DCI) {
  18171. SelectionDAG &DAG = DCI.DAG;
  18172. SDValue N0 = N->getOperand(0);
  18173. EVT ResVT = N->getValueType(0);
  18174. if (N0.getOpcode() != ISD::SETCC)
  18175. return SDValue();
  18176. if (ResVT.isScalableVector())
  18177. return SDValue();
  18178. // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
  18179. // scalar SetCCResultType. We also don't expect vectors, because we assume
  18180. // that selects fed by vector SETCCs are canonicalized to VSELECT.
  18181. assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
  18182. "Scalar-SETCC feeding SELECT has unexpected result type!");
18183. // If NumMaskElts == 0, the comparison is larger than the select result. The
  18184. // largest real NEON comparison is 64-bits per lane, which means the result is
  18185. // at most 32-bits and an illegal vector. Just bail out for now.
  18186. EVT SrcVT = N0.getOperand(0).getValueType();
  18187. // Don't try to do this optimization when the setcc itself has i1 operands.
  18188. // There are no legal vectors of i1, so this would be pointless.
  18189. if (SrcVT == MVT::i1)
  18190. return SDValue();
  18191. int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  18192. if (!ResVT.isVector() || NumMaskElts == 0)
  18193. return SDValue();
  18194. SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
  18195. EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
  18196. // Also bail out if the vector CCVT isn't the same size as ResVT.
  18197. // This can happen if the SETCC operand size doesn't divide the ResVT size
  18198. // (e.g., f64 vs v3f32).
  18199. if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
  18200. return SDValue();
  18201. // Make sure we didn't create illegal types, if we're not supposed to.
  18202. assert(DCI.isBeforeLegalize() ||
  18203. DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
  18204. // First perform a vector comparison, where lane 0 is the one we're interested
  18205. // in.
  18206. SDLoc DL(N0);
  18207. SDValue LHS =
  18208. DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  18209. SDValue RHS =
  18210. DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  18211. SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
  18212. // Now duplicate the comparison mask we want across all other lanes.
  18213. SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
  18214. SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
  18215. Mask = DAG.getNode(ISD::BITCAST, DL,
  18216. ResVT.changeVectorElementTypeToInteger(), Mask);
  18217. return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
  18218. }
  18219. static SDValue performDUPCombine(SDNode *N,
  18220. TargetLowering::DAGCombinerInfo &DCI) {
  18221. EVT VT = N->getValueType(0);
  18222. // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  18223. // 128bit vector version.
  18224. if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
  18225. EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
  18226. if (SDNode *LN = DCI.DAG.getNodeIfExists(
  18227. N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
  18228. SDLoc DL(N);
  18229. return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
  18230. DCI.DAG.getConstant(0, DL, MVT::i64));
  18231. }
  18232. }
  18233. return performPostLD1Combine(N, DCI, false);
  18234. }
  18235. /// Get rid of unnecessary NVCASTs (that don't change the type).
  18236. static SDValue performNVCASTCombine(SDNode *N) {
  18237. if (N->getValueType(0) == N->getOperand(0).getValueType())
  18238. return N->getOperand(0);
  18239. return SDValue();
  18240. }
  18241. // If all users of the globaladdr are of the form (globaladdr + constant), find
  18242. // the smallest constant, fold it into the globaladdr's offset and rewrite the
  18243. // globaladdr as (globaladdr + constant) - constant.
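// For example, if the uses are (g + 8) and (g + 12), MinOffset is 8 and g is
// re-expressed as ((g + 8) - 8); the +8 now lives inside the global address
// node, where it can typically be folded into the address computation.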
  18244. static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
  18245. const AArch64Subtarget *Subtarget,
  18246. const TargetMachine &TM) {
  18247. auto *GN = cast<GlobalAddressSDNode>(N);
  18248. if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
  18249. AArch64II::MO_NO_FLAG)
  18250. return SDValue();
  18251. uint64_t MinOffset = -1ull;
  18252. for (SDNode *N : GN->uses()) {
  18253. if (N->getOpcode() != ISD::ADD)
  18254. return SDValue();
  18255. auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
  18256. if (!C)
  18257. C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  18258. if (!C)
  18259. return SDValue();
  18260. MinOffset = std::min(MinOffset, C->getZExtValue());
  18261. }
  18262. uint64_t Offset = MinOffset + GN->getOffset();
  18263. // Require that the new offset is larger than the existing one. Otherwise, we
  18264. // can end up oscillating between two possible DAGs, for example,
  18265. // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
  18266. if (Offset <= uint64_t(GN->getOffset()))
  18267. return SDValue();
  18268. // Check whether folding this offset is legal. It must not go out of bounds of
  18269. // the referenced object to avoid violating the code model, and must be
  18270. // smaller than 2^20 because this is the largest offset expressible in all
  18271. // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  18272. // stores an immediate signed 21 bit offset.)
  18273. //
  18274. // This check also prevents us from folding negative offsets, which will end
  18275. // up being treated in the same way as large positive ones. They could also
  18276. // cause code model violations, and aren't really common enough to matter.
  18277. if (Offset >= (1 << 20))
  18278. return SDValue();
  18279. const GlobalValue *GV = GN->getGlobal();
  18280. Type *T = GV->getValueType();
  18281. if (!T->isSized() ||
  18282. Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
  18283. return SDValue();
  18284. SDLoc DL(GN);
  18285. SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
  18286. return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
  18287. DAG.getConstant(MinOffset, DL, MVT::i64));
  18288. }
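// With the CSSC extension, ctlz(bitreverse(x)) is equivalent to cttz(x), and
// CSSC provides a native CTZ instruction, so the bit-reverse can be dropped.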
  18289. static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
  18290. const AArch64Subtarget *Subtarget) {
  18291. SDValue BR = N->getOperand(0);
  18292. if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
  18293. !BR.getValueType().isScalarInteger())
  18294. return SDValue();
  18295. SDLoc DL(N);
  18296. return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
  18297. }
18298. // Turns the vector of indices into a vector of byte offsets by scaling Offset
  18299. // by (BitWidth / 8).
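// For example, indices of 32-bit elements are shifted left by 2, i.e.
// multiplied by 4.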
  18300. static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
  18301. SDLoc DL, unsigned BitWidth) {
  18302. assert(Offset.getValueType().isScalableVector() &&
  18303. "This method is only for scalable vectors of offsets");
  18304. SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
  18305. SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
  18306. return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
  18307. }
  18308. /// Check if the value of \p OffsetInBytes can be used as an immediate for
  18309. /// the gather load/prefetch and scatter store instructions with vector base and
  18310. /// immediate offset addressing mode:
  18311. ///
  18312. /// [<Zn>.[S|D]{, #<imm>}]
  18313. ///
  18314. /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
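/// For example, with 4-byte elements the valid byte offsets are 0, 4, 8, ...,
/// 124.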
  18315. inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
  18316. unsigned ScalarSizeInBytes) {
  18317. // The immediate is not a multiple of the scalar size.
  18318. if (OffsetInBytes % ScalarSizeInBytes)
  18319. return false;
  18320. // The immediate is out of range.
  18321. if (OffsetInBytes / ScalarSizeInBytes > 31)
  18322. return false;
  18323. return true;
  18324. }
  18325. /// Check if the value of \p Offset represents a valid immediate for the SVE
18326. /// gather load/prefetch and scatter store instructions with vector base and
  18327. /// immediate offset addressing mode:
  18328. ///
  18329. /// [<Zn>.[S|D]{, #<imm>}]
  18330. ///
  18331. /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
  18332. static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
  18333. unsigned ScalarSizeInBytes) {
  18334. ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
  18335. return OffsetConst && isValidImmForSVEVecImmAddrMode(
  18336. OffsetConst->getZExtValue(), ScalarSizeInBytes);
  18337. }
  18338. static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
  18339. unsigned Opcode,
  18340. bool OnlyPackedOffsets = true) {
  18341. const SDValue Src = N->getOperand(2);
  18342. const EVT SrcVT = Src->getValueType(0);
  18343. assert(SrcVT.isScalableVector() &&
  18344. "Scatter stores are only possible for SVE vectors");
  18345. SDLoc DL(N);
  18346. MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
  18347. // Make sure that source data will fit into an SVE register
  18348. if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
  18349. return SDValue();
  18350. // For FPs, ACLE only supports _packed_ single and double precision types.
  18351. if (SrcElVT.isFloatingPoint())
  18352. if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
  18353. return SDValue();
  18354. // Depending on the addressing mode, this is either a pointer or a vector of
  18355. // pointers (that fits into one register)
  18356. SDValue Base = N->getOperand(4);
  18357. // Depending on the addressing mode, this is either a single offset or a
  18358. // vector of offsets (that fits into one register)
  18359. SDValue Offset = N->getOperand(5);
  18360. // For "scalar + vector of indices", just scale the indices. This only
  18361. // applies to non-temporal scatters because there's no instruction that takes
18362. // indices.
  18363. if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
  18364. Offset =
  18365. getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
  18366. Opcode = AArch64ISD::SSTNT1_PRED;
  18367. }
18368. // In the case of non-temporal scatter stores there's only one SVE instruction
18369. // per data-size: "scalar + vector", i.e.
18370. // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
  18371. // Since we do have intrinsics that allow the arguments to be in a different
  18372. // order, we may need to swap them to match the spec.
  18373. if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
  18374. std::swap(Base, Offset);
  18375. // SST1_IMM requires that the offset is an immediate that is:
  18376. // * a multiple of #SizeInBytes,
  18377. // * in the range [0, 31 x #SizeInBytes],
  18378. // where #SizeInBytes is the size in bytes of the stored items. For
  18379. // immediates outside that range and non-immediate scalar offsets use SST1 or
  18380. // SST1_UXTW instead.
  18381. if (Opcode == AArch64ISD::SST1_IMM_PRED) {
  18382. if (!isValidImmForSVEVecImmAddrMode(Offset,
  18383. SrcVT.getScalarSizeInBits() / 8)) {
  18384. if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
  18385. Opcode = AArch64ISD::SST1_UXTW_PRED;
  18386. else
  18387. Opcode = AArch64ISD::SST1_PRED;
  18388. std::swap(Base, Offset);
  18389. }
  18390. }
  18391. auto &TLI = DAG.getTargetLoweringInfo();
  18392. if (!TLI.isTypeLegal(Base.getValueType()))
  18393. return SDValue();
  18394. // Some scatter store variants allow unpacked offsets, but only as nxv2i32
18395. // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  18396. // nxv2i64. Legalize accordingly.
  18397. if (!OnlyPackedOffsets &&
  18398. Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
  18399. Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
  18400. if (!TLI.isTypeLegal(Offset.getValueType()))
  18401. return SDValue();
  18402. // Source value type that is representable in hardware
  18403. EVT HwSrcVt = getSVEContainerType(SrcVT);
  18404. // Keep the original type of the input data to store - this is needed to be
  18405. // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
  18406. // FP values we want the integer equivalent, so just use HwSrcVt.
  18407. SDValue InputVT = DAG.getValueType(SrcVT);
  18408. if (SrcVT.isFloatingPoint())
  18409. InputVT = DAG.getValueType(HwSrcVt);
  18410. SDVTList VTs = DAG.getVTList(MVT::Other);
  18411. SDValue SrcNew;
  18412. if (Src.getValueType().isFloatingPoint())
  18413. SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
  18414. else
  18415. SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
  18416. SDValue Ops[] = {N->getOperand(0), // Chain
  18417. SrcNew,
  18418. N->getOperand(3), // Pg
  18419. Base,
  18420. Offset,
  18421. InputVT};
  18422. return DAG.getNode(Opcode, DL, VTs, Ops);
  18423. }
  18424. static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
  18425. unsigned Opcode,
  18426. bool OnlyPackedOffsets = true) {
  18427. const EVT RetVT = N->getValueType(0);
  18428. assert(RetVT.isScalableVector() &&
  18429. "Gather loads are only possible for SVE vectors");
  18430. SDLoc DL(N);
  18431. // Make sure that the loaded data will fit into an SVE register
  18432. if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
  18433. return SDValue();
  18434. // Depending on the addressing mode, this is either a pointer or a vector of
  18435. // pointers (that fits into one register)
  18436. SDValue Base = N->getOperand(3);
  18437. // Depending on the addressing mode, this is either a single offset or a
  18438. // vector of offsets (that fits into one register)
  18439. SDValue Offset = N->getOperand(4);
  18440. // For "scalar + vector of indices", just scale the indices. This only
  18441. // applies to non-temporal gathers because there's no instruction that takes
18442. // indices.
  18443. if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
  18444. Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
  18445. RetVT.getScalarSizeInBits());
  18446. Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
  18447. }
  18448. // In the case of non-temporal gather loads there's only one SVE instruction
  18449. // per data-size: "scalar + vector", i.e.
  18450. // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
  18451. // Since we do have intrinsics that allow the arguments to be in a different
  18452. // order, we may need to swap them to match the spec.
  18453. if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
  18454. Offset.getValueType().isVector())
  18455. std::swap(Base, Offset);
  18456. // GLD{FF}1_IMM requires that the offset is an immediate that is:
  18457. // * a multiple of #SizeInBytes,
  18458. // * in the range [0, 31 x #SizeInBytes],
  18459. // where #SizeInBytes is the size in bytes of the loaded items. For
  18460. // immediates outside that range and non-immediate scalar offsets use
  18461. // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
  18462. if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
  18463. Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
  18464. if (!isValidImmForSVEVecImmAddrMode(Offset,
  18465. RetVT.getScalarSizeInBits() / 8)) {
  18466. if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
  18467. Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
  18468. ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
  18469. : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
  18470. else
  18471. Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
  18472. ? AArch64ISD::GLD1_MERGE_ZERO
  18473. : AArch64ISD::GLDFF1_MERGE_ZERO;
  18474. std::swap(Base, Offset);
  18475. }
  18476. }
  18477. auto &TLI = DAG.getTargetLoweringInfo();
  18478. if (!TLI.isTypeLegal(Base.getValueType()))
  18479. return SDValue();
  18480. // Some gather load variants allow unpacked offsets, but only as nxv2i32
18481. // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  18482. // nxv2i64. Legalize accordingly.
  18483. if (!OnlyPackedOffsets &&
  18484. Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
  18485. Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
  18486. // Return value type that is representable in hardware
  18487. EVT HwRetVt = getSVEContainerType(RetVT);
  18488. // Keep the original output value type around - this is needed to be able to
  18489. // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
18490. // values we want the integer equivalent, so just use HwRetVt.
  18491. SDValue OutVT = DAG.getValueType(RetVT);
  18492. if (RetVT.isFloatingPoint())
  18493. OutVT = DAG.getValueType(HwRetVt);
  18494. SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
  18495. SDValue Ops[] = {N->getOperand(0), // Chain
  18496. N->getOperand(2), // Pg
  18497. Base, Offset, OutVT};
  18498. SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
  18499. SDValue LoadChain = SDValue(Load.getNode(), 1);
  18500. if (RetVT.isInteger() && (RetVT != HwRetVt))
  18501. Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
  18502. // If the original return value was FP, bitcast accordingly. Doing it here
  18503. // means that we can avoid adding TableGen patterns for FPs.
  18504. if (RetVT.isFloatingPoint())
  18505. Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
  18506. return DAG.getMergeValues({Load, LoadChain}, DL);
  18507. }
  18508. static SDValue
  18509. performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  18510. SelectionDAG &DAG) {
  18511. SDLoc DL(N);
  18512. SDValue Src = N->getOperand(0);
  18513. unsigned Opc = Src->getOpcode();
  18514. // Sign extend of an unsigned unpack -> signed unpack
  18515. if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
  18516. unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
  18517. : AArch64ISD::SUNPKLO;
  18518. // Push the sign extend to the operand of the unpack
  18519. // This is necessary where, for example, the operand of the unpack
  18520. // is another unpack:
  18521. // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
  18522. // ->
  18523. // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
  18524. // ->
  18525. // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
  18526. SDValue ExtOp = Src->getOperand(0);
  18527. auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
  18528. EVT EltTy = VT.getVectorElementType();
  18529. (void)EltTy;
  18530. assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
  18531. "Sign extending from an invalid type");
  18532. EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
  18533. SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
  18534. ExtOp, DAG.getValueType(ExtVT));
  18535. return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
  18536. }
  18537. if (DCI.isBeforeLegalizeOps())
  18538. return SDValue();
  18539. if (!EnableCombineMGatherIntrinsics)
  18540. return SDValue();
  18541. // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
  18542. // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
  18543. unsigned NewOpc;
  18544. unsigned MemVTOpNum = 4;
  18545. switch (Opc) {
  18546. case AArch64ISD::LD1_MERGE_ZERO:
  18547. NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
  18548. MemVTOpNum = 3;
  18549. break;
  18550. case AArch64ISD::LDNF1_MERGE_ZERO:
  18551. NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
  18552. MemVTOpNum = 3;
  18553. break;
  18554. case AArch64ISD::LDFF1_MERGE_ZERO:
  18555. NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
  18556. MemVTOpNum = 3;
  18557. break;
  18558. case AArch64ISD::GLD1_MERGE_ZERO:
  18559. NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
  18560. break;
  18561. case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  18562. NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  18563. break;
  18564. case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  18565. NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
  18566. break;
  18567. case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  18568. NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
  18569. break;
  18570. case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  18571. NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  18572. break;
  18573. case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  18574. NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
  18575. break;
  18576. case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  18577. NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  18578. break;
  18579. case AArch64ISD::GLDFF1_MERGE_ZERO:
  18580. NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
  18581. break;
  18582. case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
  18583. NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
  18584. break;
  18585. case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
  18586. NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
  18587. break;
  18588. case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
  18589. NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
  18590. break;
  18591. case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
  18592. NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
  18593. break;
  18594. case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
  18595. NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
  18596. break;
  18597. case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
  18598. NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
  18599. break;
  18600. case AArch64ISD::GLDNT1_MERGE_ZERO:
  18601. NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
  18602. break;
  18603. default:
  18604. return SDValue();
  18605. }
  18606. EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  18607. EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
  18608. if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
  18609. return SDValue();
  18610. EVT DstVT = N->getValueType(0);
  18611. SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
  18612. SmallVector<SDValue, 5> Ops;
  18613. for (unsigned I = 0; I < Src->getNumOperands(); ++I)
  18614. Ops.push_back(Src->getOperand(I));
  18615. SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
  18616. DCI.CombineTo(N, ExtLoad);
  18617. DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
  18618. // Return N so it doesn't get rechecked
  18619. return SDValue(N, 0);
  18620. }
  18621. /// Legalize the gather prefetch (scalar + vector addressing mode) when the
  18622. /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
  18623. /// != nxv2i32) do not need legalization.
  18624. static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
  18625. const unsigned OffsetPos = 4;
  18626. SDValue Offset = N->getOperand(OffsetPos);
  18627. // Not an unpacked vector, bail out.
  18628. if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
  18629. return SDValue();
  18630. // Extend the unpacked offset vector to 64-bit lanes.
  18631. SDLoc DL(N);
  18632. Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
  18633. SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  18634. // Replace the offset operand with the 64-bit one.
  18635. Ops[OffsetPos] = Offset;
  18636. return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
  18637. }
  18638. /// Combines a node carrying the intrinsic
  18639. /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
  18640. /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
  18641. /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
18642. /// SVE gather prefetch instruction with vector plus immediate addressing mode.
  18643. static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
  18644. unsigned ScalarSizeInBytes) {
  18645. const unsigned ImmPos = 4, OffsetPos = 3;
  18646. // No need to combine the node if the immediate is valid...
  18647. if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
  18648. return SDValue();
  18649. // ...otherwise swap the offset base with the offset...
  18650. SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  18651. std::swap(Ops[ImmPos], Ops[OffsetPos]);
  18652. // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
  18653. // `aarch64_sve_prfb_gather_uxtw_index`.
  18654. SDLoc DL(N);
  18655. Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
  18656. MVT::i64);
  18657. return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
  18658. }
18659. // Return true if the vector operation can guarantee that only the first lane
18660. // of its result contains data, with all bits in other lanes set to zero.
  18661. static bool isLanes1toNKnownZero(SDValue Op) {
  18662. switch (Op.getOpcode()) {
  18663. default:
  18664. return false;
  18665. case AArch64ISD::ANDV_PRED:
  18666. case AArch64ISD::EORV_PRED:
  18667. case AArch64ISD::FADDA_PRED:
  18668. case AArch64ISD::FADDV_PRED:
  18669. case AArch64ISD::FMAXNMV_PRED:
  18670. case AArch64ISD::FMAXV_PRED:
  18671. case AArch64ISD::FMINNMV_PRED:
  18672. case AArch64ISD::FMINV_PRED:
  18673. case AArch64ISD::ORV_PRED:
  18674. case AArch64ISD::SADDV_PRED:
  18675. case AArch64ISD::SMAXV_PRED:
  18676. case AArch64ISD::SMINV_PRED:
  18677. case AArch64ISD::UADDV_PRED:
  18678. case AArch64ISD::UMAXV_PRED:
  18679. case AArch64ISD::UMINV_PRED:
  18680. return true;
  18681. }
  18682. }
  18683. static SDValue removeRedundantInsertVectorElt(SDNode *N) {
  18684. assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
  18685. SDValue InsertVec = N->getOperand(0);
  18686. SDValue InsertElt = N->getOperand(1);
  18687. SDValue InsertIdx = N->getOperand(2);
  18688. // We only care about inserts into the first element...
  18689. if (!isNullConstant(InsertIdx))
  18690. return SDValue();
  18691. // ...of a zero'd vector...
  18692. if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
  18693. return SDValue();
  18694. // ...where the inserted data was previously extracted...
  18695. if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  18696. return SDValue();
  18697. SDValue ExtractVec = InsertElt.getOperand(0);
  18698. SDValue ExtractIdx = InsertElt.getOperand(1);
  18699. // ...from the first element of a vector.
  18700. if (!isNullConstant(ExtractIdx))
  18701. return SDValue();
  18702. // If we get here we are effectively trying to zero lanes 1-N of a vector.
  18703. // Ensure there's no type conversion going on.
  18704. if (N->getValueType(0) != ExtractVec.getValueType())
  18705. return SDValue();
  18706. if (!isLanes1toNKnownZero(ExtractVec))
  18707. return SDValue();
  18708. // The explicit zeroing is redundant.
  18709. return ExtractVec;
  18710. }
  18711. static SDValue
  18712. performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  18713. if (SDValue Res = removeRedundantInsertVectorElt(N))
  18714. return Res;
  18715. return performPostLD1Combine(N, DCI, true);
  18716. }
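// Perform a floating-point VECTOR_SPLICE on the equivalent integer type inside
// a packed SVE container, then cast the result back. Integer splices are left
// to the default lowering.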
  18717. static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
  18718. EVT Ty = N->getValueType(0);
  18719. if (Ty.isInteger())
  18720. return SDValue();
  18721. EVT IntTy = Ty.changeVectorElementTypeToInteger();
  18722. EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
  18723. if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
  18724. IntTy.getVectorElementType().getScalarSizeInBits())
  18725. return SDValue();
  18726. SDLoc DL(N);
  18727. SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
  18728. DL, ExtIntTy);
  18729. SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
  18730. DL, ExtIntTy);
  18731. SDValue Idx = N->getOperand(2);
  18732. SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
  18733. SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
  18734. return DAG.getBitcast(Ty, Trunc);
  18735. }
  18736. static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
  18737. TargetLowering::DAGCombinerInfo &DCI,
  18738. const AArch64Subtarget *Subtarget) {
  18739. SDValue N0 = N->getOperand(0);
  18740. EVT VT = N->getValueType(0);
  18741. // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  18742. if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
  18743. return SDValue();
  18744. auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
  18745. EVT EltVT = VT.getVectorElementType();
  18746. return EltVT == MVT::f32 || EltVT == MVT::f64;
  18747. };
  18748. // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  18749. // We purposefully don't care about legality of the nodes here as we know
  18750. // they can be split down into something legal.
  18751. if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
  18752. N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
  18753. VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
  18754. VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
  18755. LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  18756. SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
  18757. LN0->getChain(), LN0->getBasePtr(),
  18758. N0.getValueType(), LN0->getMemOperand());
  18759. DCI.CombineTo(N, ExtLoad);
  18760. DCI.CombineTo(
  18761. N0.getNode(),
  18762. DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
  18763. DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
  18764. ExtLoad.getValue(1));
  18765. return SDValue(N, 0); // Return N so it doesn't get rechecked!
  18766. }
  18767. return SDValue();
  18768. }
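// Expand BSP (bitwise select) as (Mask & In1) | (~Mask & In2) for scalable
// vectors on plain SVE, which lacks the SVE2/SME bitwise-select instructions.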
  18769. static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
  18770. const AArch64Subtarget *Subtarget) {
  18771. EVT VT = N->getValueType(0);
  18772. // Don't expand for NEON, SVE2 or SME
  18773. if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
  18774. return SDValue();
  18775. SDLoc DL(N);
  18776. SDValue Mask = N->getOperand(0);
  18777. SDValue In1 = N->getOperand(1);
  18778. SDValue In2 = N->getOperand(2);
  18779. SDValue InvMask = DAG.getNOT(DL, Mask, VT);
  18780. SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
  18781. SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
  18782. return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
  18783. }
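// Fold duplane128(insert_subvector(undef, bitcast(128-bit x), 0), 0) so that
// the DUPLANE128 is performed on x's own element type, with a single bitcast
// of the final result.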
  18784. static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
  18785. EVT VT = N->getValueType(0);
  18786. SDValue Insert = N->getOperand(0);
  18787. if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
  18788. return SDValue();
  18789. if (!Insert.getOperand(0).isUndef())
  18790. return SDValue();
  18791. uint64_t IdxInsert = Insert.getConstantOperandVal(2);
  18792. uint64_t IdxDupLane = N->getConstantOperandVal(1);
  18793. if (IdxInsert != 0 || IdxDupLane != 0)
  18794. return SDValue();
  18795. SDValue Bitcast = Insert.getOperand(1);
  18796. if (Bitcast.getOpcode() != ISD::BITCAST)
  18797. return SDValue();
  18798. SDValue Subvec = Bitcast.getOperand(0);
  18799. EVT SubvecVT = Subvec.getValueType();
  18800. if (!SubvecVT.is128BitVector())
  18801. return SDValue();
  18802. EVT NewSubvecVT =
  18803. getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
  18804. SDLoc DL(N);
  18805. SDValue NewInsert =
  18806. DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
  18807. DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
  18808. SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
  18809. NewInsert, N->getOperand(1));
  18810. return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
  18811. }
  18812. SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
  18813. DAGCombinerInfo &DCI) const {
  18814. SelectionDAG &DAG = DCI.DAG;
  18815. switch (N->getOpcode()) {
  18816. default:
  18817. LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
  18818. break;
  18819. case ISD::ADD:
  18820. case ISD::SUB:
  18821. return performAddSubCombine(N, DCI, DAG);
  18822. case ISD::BUILD_VECTOR:
  18823. return performBuildVectorCombine(N, DCI, DAG);
  18824. case ISD::TRUNCATE:
  18825. return performTruncateCombine(N, DAG);
  18826. case AArch64ISD::ANDS:
  18827. return performFlagSettingCombine(N, DCI, ISD::AND);
  18828. case AArch64ISD::ADC:
  18829. if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
  18830. return R;
  18831. return foldADCToCINC(N, DAG);
  18832. case AArch64ISD::SBC:
  18833. return foldOverflowCheck(N, DAG, /* IsAdd */ false);
  18834. case AArch64ISD::ADCS:
  18835. if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
  18836. return R;
  18837. return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
  18838. case AArch64ISD::SBCS:
  18839. if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
  18840. return R;
  18841. return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
  18842. case ISD::XOR:
  18843. return performXorCombine(N, DAG, DCI, Subtarget);
  18844. case ISD::MUL:
  18845. return performMulCombine(N, DAG, DCI, Subtarget);
  18846. case ISD::SINT_TO_FP:
  18847. case ISD::UINT_TO_FP:
  18848. return performIntToFpCombine(N, DAG, Subtarget);
  18849. case ISD::FP_TO_SINT:
  18850. case ISD::FP_TO_UINT:
  18851. case ISD::FP_TO_SINT_SAT:
  18852. case ISD::FP_TO_UINT_SAT:
  18853. return performFpToIntCombine(N, DAG, DCI, Subtarget);
  18854. case ISD::FDIV:
  18855. return performFDivCombine(N, DAG, DCI, Subtarget);
  18856. case ISD::OR:
  18857. return performORCombine(N, DCI, Subtarget, *this);
  18858. case ISD::AND:
  18859. return performANDCombine(N, DCI);
  18860. case ISD::INTRINSIC_WO_CHAIN:
  18861. return performIntrinsicCombine(N, DCI, Subtarget);
  18862. case ISD::ANY_EXTEND:
  18863. case ISD::ZERO_EXTEND:
  18864. case ISD::SIGN_EXTEND:
  18865. return performExtendCombine(N, DCI, DAG);
  18866. case ISD::SIGN_EXTEND_INREG:
  18867. return performSignExtendInRegCombine(N, DCI, DAG);
  18868. case ISD::CONCAT_VECTORS:
  18869. return performConcatVectorsCombine(N, DCI, DAG);
  18870. case ISD::EXTRACT_SUBVECTOR:
  18871. return performExtractSubvectorCombine(N, DCI, DAG);
  18872. case ISD::INSERT_SUBVECTOR:
  18873. return performInsertSubvectorCombine(N, DCI, DAG);
  18874. case ISD::SELECT:
  18875. return performSelectCombine(N, DCI);
  18876. case ISD::VSELECT:
  18877. return performVSelectCombine(N, DCI.DAG);
  18878. case ISD::SETCC:
  18879. return performSETCCCombine(N, DCI, DAG);
  18880. case ISD::LOAD:
  18881. return performLOADCombine(N, DCI, DAG, Subtarget);
  18882. case ISD::STORE:
  18883. return performSTORECombine(N, DCI, DAG, Subtarget);
  18884. case ISD::MSTORE:
  18885. return performMSTORECombine(N, DCI, DAG, Subtarget);
  18886. case ISD::MGATHER:
  18887. case ISD::MSCATTER:
  18888. return performMaskedGatherScatterCombine(N, DCI, DAG);
  18889. case ISD::VECTOR_SPLICE:
  18890. return performSVESpliceCombine(N, DAG);
  18891. case ISD::FP_EXTEND:
  18892. return performFPExtendCombine(N, DAG, DCI, Subtarget);
  18893. case AArch64ISD::BRCOND:
  18894. return performBRCONDCombine(N, DCI, DAG);
  18895. case AArch64ISD::TBNZ:
  18896. case AArch64ISD::TBZ:
  18897. return performTBZCombine(N, DCI, DAG);
  18898. case AArch64ISD::CSEL:
  18899. return performCSELCombine(N, DCI, DAG);
  18900. case AArch64ISD::DUP:
  18901. return performDUPCombine(N, DCI);
  18902. case AArch64ISD::DUPLANE128:
  18903. return performDupLane128Combine(N, DAG);
  18904. case AArch64ISD::NVCAST:
  18905. return performNVCASTCombine(N);
  18906. case AArch64ISD::SPLICE:
  18907. return performSpliceCombine(N, DAG);
  18908. case AArch64ISD::UUNPKLO:
  18909. case AArch64ISD::UUNPKHI:
  18910. return performUnpackCombine(N, DAG, Subtarget);
  18911. case AArch64ISD::UZP1:
  18912. return performUzpCombine(N, DAG);
  18913. case AArch64ISD::SETCC_MERGE_ZERO:
  18914. return performSetccMergeZeroCombine(N, DCI);
  18915. case AArch64ISD::REINTERPRET_CAST:
  18916. return performReinterpretCastCombine(N);
  18917. case AArch64ISD::GLD1_MERGE_ZERO:
  18918. case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  18919. case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  18920. case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  18921. case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  18922. case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  18923. case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  18924. case AArch64ISD::GLD1S_MERGE_ZERO:
  18925. case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
  18926. case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
  18927. case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
  18928. case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
  18929. case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
  18930. case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
  18931. return performGLD1Combine(N, DAG);
  18932. case AArch64ISD::VASHR:
  18933. case AArch64ISD::VLSHR:
  18934. return performVectorShiftCombine(N, *this, DCI);
  18935. case AArch64ISD::SUNPKLO:
  18936. return performSunpkloCombine(N, DAG);
  18937. case AArch64ISD::BSP:
  18938. return performBSPExpandForSVE(N, DAG, Subtarget);
  18939. case ISD::INSERT_VECTOR_ELT:
  18940. return performInsertVectorEltCombine(N, DCI);
  18941. case ISD::EXTRACT_VECTOR_ELT:
  18942. return performExtractVectorEltCombine(N, DCI, Subtarget);
  18943. case ISD::VECREDUCE_ADD:
  18944. return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
  18945. case AArch64ISD::UADDV:
  18946. return performUADDVCombine(N, DAG);
  18947. case AArch64ISD::SMULL:
  18948. case AArch64ISD::UMULL:
  18949. case AArch64ISD::PMULL:
  18950. return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
  18951. case ISD::INTRINSIC_VOID:
  18952. case ISD::INTRINSIC_W_CHAIN:
  18953. switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
  18954. case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
  18955. return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
  18956. case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
  18957. return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
  18958. case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
  18959. return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
  18960. case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
  18961. return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
  18962. case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
  18963. case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
  18964. case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
  18965. case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
  18966. case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
  18967. case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
  18968. case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
  18969. case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
  18970. return legalizeSVEGatherPrefetchOffsVec(N, DAG);
  18971. case Intrinsic::aarch64_neon_ld2:
  18972. case Intrinsic::aarch64_neon_ld3:
  18973. case Intrinsic::aarch64_neon_ld4:
  18974. case Intrinsic::aarch64_neon_ld1x2:
  18975. case Intrinsic::aarch64_neon_ld1x3:
  18976. case Intrinsic::aarch64_neon_ld1x4:
  18977. case Intrinsic::aarch64_neon_ld2lane:
  18978. case Intrinsic::aarch64_neon_ld3lane:
  18979. case Intrinsic::aarch64_neon_ld4lane:
  18980. case Intrinsic::aarch64_neon_ld2r:
  18981. case Intrinsic::aarch64_neon_ld3r:
  18982. case Intrinsic::aarch64_neon_ld4r:
  18983. case Intrinsic::aarch64_neon_st2:
  18984. case Intrinsic::aarch64_neon_st3:
  18985. case Intrinsic::aarch64_neon_st4:
  18986. case Intrinsic::aarch64_neon_st1x2:
  18987. case Intrinsic::aarch64_neon_st1x3:
  18988. case Intrinsic::aarch64_neon_st1x4:
  18989. case Intrinsic::aarch64_neon_st2lane:
  18990. case Intrinsic::aarch64_neon_st3lane:
  18991. case Intrinsic::aarch64_neon_st4lane:
  18992. return performNEONPostLDSTCombine(N, DCI, DAG);
  18993. case Intrinsic::aarch64_sve_ldnt1:
  18994. return performLDNT1Combine(N, DAG);
  18995. case Intrinsic::aarch64_sve_ld1rq:
  18996. return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
  18997. case Intrinsic::aarch64_sve_ld1ro:
  18998. return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
  18999. case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
  19000. return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
  19001. case Intrinsic::aarch64_sve_ldnt1_gather:
  19002. return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
  19003. case Intrinsic::aarch64_sve_ldnt1_gather_index:
  19004. return performGatherLoadCombine(N, DAG,
  19005. AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
  19006. case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
  19007. return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
  19008. case Intrinsic::aarch64_sve_ld1:
  19009. return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
  19010. case Intrinsic::aarch64_sve_ldnf1:
  19011. return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
  19012. case Intrinsic::aarch64_sve_ldff1:
  19013. return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
  19014. case Intrinsic::aarch64_sve_st1:
  19015. return performST1Combine(N, DAG);
  19016. case Intrinsic::aarch64_sve_stnt1:
  19017. return performSTNT1Combine(N, DAG);
  19018. case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
  19019. return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
  19020. case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
  19021. return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
  19022. case Intrinsic::aarch64_sve_stnt1_scatter:
  19023. return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
  19024. case Intrinsic::aarch64_sve_stnt1_scatter_index:
  19025. return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
  19026. case Intrinsic::aarch64_sve_ld1_gather:
  19027. return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
  19028. case Intrinsic::aarch64_sve_ld1_gather_index:
  19029. return performGatherLoadCombine(N, DAG,
  19030. AArch64ISD::GLD1_SCALED_MERGE_ZERO);
  19031. case Intrinsic::aarch64_sve_ld1_gather_sxtw:
  19032. return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
  19033. /*OnlyPackedOffsets=*/false);
  19034. case Intrinsic::aarch64_sve_ld1_gather_uxtw:
  19035. return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
  19036. /*OnlyPackedOffsets=*/false);
  19037. case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
  19038. return performGatherLoadCombine(N, DAG,
  19039. AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
  19040. /*OnlyPackedOffsets=*/false);
  19041. case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
  19042. return performGatherLoadCombine(N, DAG,
  19043. AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
  19044. /*OnlyPackedOffsets=*/false);
  19045. case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
  19046. return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
  19047. case Intrinsic::aarch64_sve_ldff1_gather:
  19048. return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
  19049. case Intrinsic::aarch64_sve_ldff1_gather_index:
  19050. return performGatherLoadCombine(N, DAG,
  19051. AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
  19052. case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
  19053. return performGatherLoadCombine(N, DAG,
  19054. AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
  19055. /*OnlyPackedOffsets=*/false);
  19056. case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
  19057. return performGatherLoadCombine(N, DAG,
  19058. AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
  19059. /*OnlyPackedOffsets=*/false);
  19060. case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
  19061. return performGatherLoadCombine(N, DAG,
  19062. AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
  19063. /*OnlyPackedOffsets=*/false);
  19064. case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
  19065. return performGatherLoadCombine(N, DAG,
  19066. AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
  19067. /*OnlyPackedOffsets=*/false);
  19068. case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
  19069. return performGatherLoadCombine(N, DAG,
  19070. AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
  19071. case Intrinsic::aarch64_sve_st1_scatter:
  19072. return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
  19073. case Intrinsic::aarch64_sve_st1_scatter_index:
  19074. return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
  19075. case Intrinsic::aarch64_sve_st1_scatter_sxtw:
  19076. return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
  19077. /*OnlyPackedOffsets=*/false);
  19078. case Intrinsic::aarch64_sve_st1_scatter_uxtw:
  19079. return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
  19080. /*OnlyPackedOffsets=*/false);
  19081. case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
  19082. return performScatterStoreCombine(N, DAG,
  19083. AArch64ISD::SST1_SXTW_SCALED_PRED,
  19084. /*OnlyPackedOffsets=*/false);
  19085. case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
  19086. return performScatterStoreCombine(N, DAG,
  19087. AArch64ISD::SST1_UXTW_SCALED_PRED,
  19088. /*OnlyPackedOffsets=*/false);
  19089. case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
  19090. return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
  19091. case Intrinsic::aarch64_rndr:
  19092. case Intrinsic::aarch64_rndrrs: {
  19093. unsigned IntrinsicID =
  19094. cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  19095. auto Register =
  19096. (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
  19097. : AArch64SysReg::RNDRRS);
  19098. SDLoc DL(N);
  19099. SDValue A = DAG.getNode(
  19100. AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
  19101. N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
  19102. SDValue B = DAG.getNode(
  19103. AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
  19104. DAG.getConstant(0, DL, MVT::i32),
  19105. DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
  19106. return DAG.getMergeValues(
  19107. {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
  19108. }
  19109. default:
  19110. break;
  19111. }
  19112. break;
  19113. case ISD::GlobalAddress:
  19114. return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
  19115. case ISD::CTLZ:
  19116. return performCTLZCombine(N, DAG, Subtarget);
  19117. }
  19118. return SDValue();
  19119. }
  19120. // Check if the return value is used as only a return value, as otherwise
  19121. // we can't perform a tail-call. In particular, we need to check for
  19122. // target ISD nodes that are returns and any other "odd" constructs
  19123. // that the generic analysis code won't necessarily catch.
  19124. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
  19125. SDValue &Chain) const {
  19126. if (N->getNumValues() != 1)
  19127. return false;
  19128. if (!N->hasNUsesOfValue(1, 0))
  19129. return false;
  19130. SDValue TCChain = Chain;
  19131. SDNode *Copy = *N->use_begin();
  19132. if (Copy->getOpcode() == ISD::CopyToReg) {
  19133. // If the copy has a glue operand, we conservatively assume it isn't safe to
  19134. // perform a tail call.
  19135. if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
  19136. MVT::Glue)
  19137. return false;
  19138. TCChain = Copy->getOperand(0);
  19139. } else if (Copy->getOpcode() != ISD::FP_EXTEND)
  19140. return false;
  19141. bool HasRet = false;
  19142. for (SDNode *Node : Copy->uses()) {
  19143. if (Node->getOpcode() != AArch64ISD::RET_FLAG)
  19144. return false;
  19145. HasRet = true;
  19146. }
  19147. if (!HasRet)
  19148. return false;
  19149. Chain = TCChain;
  19150. return true;
  19151. }
19152. // Return whether an instruction can potentially be optimized to a tail
  19153. // call. This will cause the optimizers to attempt to move, or duplicate,
  19154. // return instructions to help enable tail call optimizations for this
  19155. // instruction.
  19156. bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  19157. return CI->isTailCall();
  19158. }
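// Determine whether Op (an ADD or SUB feeding the memory access N) can be
// expressed as a base plus a signed 9-bit immediate offset, as required by the
// pre/post-indexed load and store forms.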
  19159. bool AArch64TargetLowering::getIndexedAddressParts(
  19160. SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
  19161. ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const {
  19162. if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
  19163. return false;
  19164. // Non-null if there is exactly one user of the loaded value (ignoring chain).
  19165. SDNode *ValOnlyUser = nullptr;
  19166. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
  19167. ++UI) {
  19168. if (UI.getUse().getResNo() == 1)
  19169. continue; // Ignore chain.
  19170. if (ValOnlyUser == nullptr)
  19171. ValOnlyUser = *UI;
  19172. else {
  19173. ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
  19174. break;
  19175. }
  19176. }
  19177. auto IsUndefOrZero = [](SDValue V) {
  19178. return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
  19179. };
  19180. // If the only user of the value is a scalable vector splat, it is
  19181. // preferable to do a replicating load (ld1r*).
  19182. if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
  19183. (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
  19184. (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
  19185. IsUndefOrZero(ValOnlyUser->getOperand(2)))))
  19186. return false;
  19187. Base = Op->getOperand(0);
  19188. // All of the indexed addressing mode instructions take a signed
  19189. // 9 bit immediate offset.
  19190. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
  19191. int64_t RHSC = RHS->getSExtValue();
  19192. if (Op->getOpcode() == ISD::SUB)
  19193. RHSC = -(uint64_t)RHSC;
  19194. if (!isInt<9>(RHSC))
  19195. return false;
  19196. IsInc = (Op->getOpcode() == ISD::ADD);
  19197. Offset = Op->getOperand(1);
  19198. return true;
  19199. }
  19200. return false;
  19201. }
  19202. bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
  19203. SDValue &Offset,
  19204. ISD::MemIndexedMode &AM,
  19205. SelectionDAG &DAG) const {
  19206. EVT VT;
  19207. SDValue Ptr;
  19208. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  19209. VT = LD->getMemoryVT();
  19210. Ptr = LD->getBasePtr();
  19211. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  19212. VT = ST->getMemoryVT();
  19213. Ptr = ST->getBasePtr();
  19214. } else
  19215. return false;
  19216. bool IsInc;
  19217. if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
  19218. return false;
  19219. AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  19220. return true;
  19221. }
  19222. bool AArch64TargetLowering::getPostIndexedAddressParts(
  19223. SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
  19224. ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  19225. EVT VT;
  19226. SDValue Ptr;
  19227. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  19228. VT = LD->getMemoryVT();
  19229. Ptr = LD->getBasePtr();
  19230. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  19231. VT = ST->getMemoryVT();
  19232. Ptr = ST->getBasePtr();
  19233. } else
  19234. return false;
  19235. bool IsInc;
  19236. if (!getIndexedAddressParts(N, Op, Base, Offset, AM, IsInc, DAG))
  19237. return false;
  19238. // Post-indexing updates the base, so it's not a valid transform
  19239. // if that's not the same as the load's pointer.
  19240. if (Ptr != Base)
  19241. return false;
  19242. AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  19243. return true;
  19244. }
  19245. void AArch64TargetLowering::ReplaceBITCASTResults(
  19246. SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  19247. SDLoc DL(N);
  19248. SDValue Op = N->getOperand(0);
  19249. EVT VT = N->getValueType(0);
  19250. EVT SrcVT = Op.getValueType();
  19251. if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
  19252. assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
  19253. "Expected fp->int bitcast!");
  19254. // Bitcasting between unpacked vector types of different element counts is
  19255. // not a NOP because the live elements are laid out differently.
19256. //                01234567
19257. // e.g. nxv2i32 = XX??XX??
19258. //      nxv4f16 = X?X?X?X?
  19259. if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
  19260. return;
  19261. SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
  19262. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
  19263. return;
  19264. }
  19265. if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
  19266. return;
  19267. Op = SDValue(
  19268. DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
  19269. DAG.getUNDEF(MVT::i32), Op,
  19270. DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
  19271. 0);
  19272. Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  19273. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
  19274. }
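// Replace a 256-bit add(x, shuffle(x, <1,0,3,2,...>)), i.e. a sum of adjacent
// element pairs, with an ADDP of the two 128-bit halves of x followed by a
// shuffle that duplicates each pairwise sum into both of the original lanes.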
  19275. static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
  19276. SelectionDAG &DAG,
  19277. const AArch64Subtarget *Subtarget) {
  19278. EVT VT = N->getValueType(0);
  19279. if (!VT.is256BitVector() ||
  19280. (VT.getScalarType().isFloatingPoint() &&
  19281. !N->getFlags().hasAllowReassociation()) ||
  19282. (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
  19283. return;
  19284. SDValue X = N->getOperand(0);
  19285. auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
  19286. if (!Shuf) {
  19287. Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
  19288. X = N->getOperand(1);
  19289. if (!Shuf)
  19290. return;
  19291. }
  19292. if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
  19293. return;
  19294. // Check the mask is 1,0,3,2,5,4,...
  19295. ArrayRef<int> Mask = Shuf->getMask();
  19296. for (int I = 0, E = Mask.size(); I < E; I++)
  19297. if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
  19298. return;
  19299. SDLoc DL(N);
  19300. auto LoHi = DAG.SplitVector(X, DL);
  19301. assert(LoHi.first.getValueType() == LoHi.second.getValueType());
  19302. SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
  19303. LoHi.first, LoHi.second);
  19304. // Shuffle the elements back into order.
  19305. SmallVector<int> NMask;
  19306. for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
  19307. NMask.push_back(I);
  19308. NMask.push_back(I);
  19309. }
  19310. Results.push_back(
  19311. DAG.getVectorShuffle(VT, DL,
  19312. DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
  19313. DAG.getUNDEF(LoHi.first.getValueType())),
  19314. DAG.getUNDEF(VT), NMask));
  19315. }
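// Legalise an illegally wide across-vector reduction by splitting the operand
// in half, combining the two halves with InterOp, and then reducing the
// narrower intermediate value with AcrossOp.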
  19316. static void ReplaceReductionResults(SDNode *N,
  19317. SmallVectorImpl<SDValue> &Results,
  19318. SelectionDAG &DAG, unsigned InterOp,
  19319. unsigned AcrossOp) {
  19320. EVT LoVT, HiVT;
  19321. SDValue Lo, Hi;
  19322. SDLoc dl(N);
  19323. std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  19324. std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  19325. SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
  19326. SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  19327. Results.push_back(SplitVal);
  19328. }
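// Split an i128 value into its low and high i64 halves, returned as {Lo, Hi}.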
  19329. static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
  19330. SDLoc DL(N);
  19331. SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
  19332. SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
  19333. DAG.getNode(ISD::SRL, DL, MVT::i128, N,
  19334. DAG.getConstant(64, DL, MVT::i64)));
  19335. return std::make_pair(Lo, Hi);
  19336. }
  19337. void AArch64TargetLowering::ReplaceExtractSubVectorResults(
  19338. SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  19339. SDValue In = N->getOperand(0);
  19340. EVT InVT = In.getValueType();
  19341. // Common code will handle these just fine.
  19342. if (!InVT.isScalableVector() || !InVT.isInteger())
  19343. return;
  19344. SDLoc DL(N);
  19345. EVT VT = N->getValueType(0);
  19346. // The following checks bail if this is not a halving operation.
  19347. ElementCount ResEC = VT.getVectorElementCount();
  19348. if (InVT.getVectorElementCount() != (ResEC * 2))
  19349. return;
  19350. auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
  19351. if (!CIndex)
  19352. return;
  19353. unsigned Index = CIndex->getZExtValue();
  19354. if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
  19355. return;
  19356. unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
  19357. EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
  19358. SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
  19359. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
  19360. }
  19361. // Create an even/odd pair of X registers holding integer value V.
  19362. static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  19363. SDLoc dl(V.getNode());
  19364. SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
  19365. SDValue VHi = DAG.getAnyExtOrTrunc(
  19366. DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
  19367. dl, MVT::i64);
  19368. if (DAG.getDataLayout().isBigEndian())
19369. std::swap(VLo, VHi);
  19370. SDValue RegClass =
  19371. DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  19372. SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  19373. SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
  19374. const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  19375. return SDValue(
  19376. DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
  19377. }
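// Lower a 128-bit ATOMIC_CMP_SWAP. With LSE (or outlined atomics) this selects
// the CASP variant matching the memory ordering, marshalling the i128 operands
// through even/odd X register pairs; otherwise it emits a CMP_SWAP_128* pseudo
// for later expansion.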
  19378. static void ReplaceCMP_SWAP_128Results(SDNode *N,
  19379. SmallVectorImpl<SDValue> &Results,
  19380. SelectionDAG &DAG,
  19381. const AArch64Subtarget *Subtarget) {
  19382. assert(N->getValueType(0) == MVT::i128 &&
  19383. "AtomicCmpSwap on types less than 128 should be legal");
  19384. MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  19385. if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
  19386. // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
  19387. // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
  19388. SDValue Ops[] = {
  19389. createGPRPairNode(DAG, N->getOperand(2)), // Compare value
  19390. createGPRPairNode(DAG, N->getOperand(3)), // Store value
  19391. N->getOperand(1), // Ptr
  19392. N->getOperand(0), // Chain in
  19393. };
  19394. unsigned Opcode;
  19395. switch (MemOp->getMergedOrdering()) {
  19396. case AtomicOrdering::Monotonic:
  19397. Opcode = AArch64::CASPX;
  19398. break;
  19399. case AtomicOrdering::Acquire:
  19400. Opcode = AArch64::CASPAX;
  19401. break;
  19402. case AtomicOrdering::Release:
  19403. Opcode = AArch64::CASPLX;
  19404. break;
  19405. case AtomicOrdering::AcquireRelease:
  19406. case AtomicOrdering::SequentiallyConsistent:
  19407. Opcode = AArch64::CASPALX;
  19408. break;
  19409. default:
  19410. llvm_unreachable("Unexpected ordering!");
  19411. }
  19412. MachineSDNode *CmpSwap = DAG.getMachineNode(
  19413. Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
  19414. DAG.setNodeMemRefs(CmpSwap, {MemOp});
  19415. unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
  19416. if (DAG.getDataLayout().isBigEndian())
  19417. std::swap(SubReg1, SubReg2);
  19418. SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
  19419. SDValue(CmpSwap, 0));
  19420. SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
  19421. SDValue(CmpSwap, 0));
  19422. Results.push_back(
  19423. DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
  19424. Results.push_back(SDValue(CmpSwap, 1)); // Chain out
  19425. return;
  19426. }
  19427. unsigned Opcode;
  19428. switch (MemOp->getMergedOrdering()) {
  19429. case AtomicOrdering::Monotonic:
  19430. Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
  19431. break;
  19432. case AtomicOrdering::Acquire:
  19433. Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
  19434. break;
  19435. case AtomicOrdering::Release:
  19436. Opcode = AArch64::CMP_SWAP_128_RELEASE;
  19437. break;
  19438. case AtomicOrdering::AcquireRelease:
  19439. case AtomicOrdering::SequentiallyConsistent:
  19440. Opcode = AArch64::CMP_SWAP_128;
  19441. break;
  19442. default:
  19443. llvm_unreachable("Unexpected ordering!");
  19444. }
  19445. auto Desired = splitInt128(N->getOperand(2), DAG);
  19446. auto New = splitInt128(N->getOperand(3), DAG);
  19447. SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
  19448. New.first, New.second, N->getOperand(0)};
  19449. SDNode *CmpSwap = DAG.getMachineNode(
  19450. Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
  19451. Ops);
  19452. DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
  19453. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
  19454. SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
  19455. Results.push_back(SDValue(CmpSwap, 3));
  19456. }
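// Entry point for custom result-type legalisation: produce replacement values
// (in Results) for nodes whose result type is not legal for AArch64.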
  19457. void AArch64TargetLowering::ReplaceNodeResults(
  19458. SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  19459. switch (N->getOpcode()) {
  19460. default:
  19461. llvm_unreachable("Don't know how to custom expand this");
  19462. case ISD::BITCAST:
  19463. ReplaceBITCASTResults(N, Results, DAG);
  19464. return;
  19465. case ISD::VECREDUCE_ADD:
  19466. case ISD::VECREDUCE_SMAX:
  19467. case ISD::VECREDUCE_SMIN:
  19468. case ISD::VECREDUCE_UMAX:
  19469. case ISD::VECREDUCE_UMIN:
  19470. Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
  19471. return;
  19472. case ISD::ADD:
  19473. case ISD::FADD:
  19474. ReplaceAddWithADDP(N, Results, DAG, Subtarget);
  19475. return;
  19476. case ISD::CTPOP:
  19477. case ISD::PARITY:
  19478. if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
  19479. Results.push_back(Result);
  19480. return;
  19481. case AArch64ISD::SADDV:
  19482. ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
  19483. return;
  19484. case AArch64ISD::UADDV:
  19485. ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
  19486. return;
  19487. case AArch64ISD::SMINV:
  19488. ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
  19489. return;
  19490. case AArch64ISD::UMINV:
  19491. ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
  19492. return;
  19493. case AArch64ISD::SMAXV:
  19494. ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
  19495. return;
  19496. case AArch64ISD::UMAXV:
  19497. ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
  19498. return;
  19499. case ISD::FP_TO_UINT:
  19500. case ISD::FP_TO_SINT:
  19501. case ISD::STRICT_FP_TO_SINT:
  19502. case ISD::STRICT_FP_TO_UINT:
  19503. assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
  19504. // Let normal code take care of it by not adding anything to Results.
  19505. return;
  19506. case ISD::ATOMIC_CMP_SWAP:
  19507. ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
  19508. return;
  19509. case ISD::ATOMIC_LOAD:
  19510. case ISD::LOAD: {
  19511. MemSDNode *LoadNode = cast<MemSDNode>(N);
  19512. EVT MemVT = LoadNode->getMemoryVT();
19513. // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
  19514. // targets.
  19515. if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
  19516. MemVT.getSizeInBits() == 256u &&
  19517. (MemVT.getScalarSizeInBits() == 8u ||
  19518. MemVT.getScalarSizeInBits() == 16u ||
  19519. MemVT.getScalarSizeInBits() == 32u ||
  19520. MemVT.getScalarSizeInBits() == 64u)) {
  19521. SDValue Result = DAG.getMemIntrinsicNode(
  19522. AArch64ISD::LDNP, SDLoc(N),
  19523. DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
  19524. MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
  19525. MVT::Other}),
  19526. {LoadNode->getChain(), LoadNode->getBasePtr()},
  19527. LoadNode->getMemoryVT(), LoadNode->getMemOperand());
  19528. SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
  19529. Result.getValue(0), Result.getValue(1));
  19530. Results.append({Pair, Result.getValue(2) /* Chain */});
  19531. return;
  19532. }
  19533. if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
  19534. LoadNode->getMemoryVT() != MVT::i128) {
19535. // Only volatile or atomic i128 loads need custom lowering here; everything
19536. // else is handled later, e.g. by AArch64's load/store optimizer.
  19537. return;
  19538. }
  19539. if (SDValue(N, 0).getValueType() == MVT::i128) {
  19540. SDValue Result = DAG.getMemIntrinsicNode(
  19541. AArch64ISD::LDP, SDLoc(N),
  19542. DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
  19543. {LoadNode->getChain(), LoadNode->getBasePtr()},
  19544. LoadNode->getMemoryVT(), LoadNode->getMemOperand());
  19545. SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
  19546. Result.getValue(0), Result.getValue(1));
  19547. Results.append({Pair, Result.getValue(2) /* Chain */});
  19548. }
  19549. return;
  19550. }
  19551. case ISD::EXTRACT_SUBVECTOR:
  19552. ReplaceExtractSubVectorResults(N, Results, DAG);
  19553. return;
  19554. case ISD::INSERT_SUBVECTOR:
  19555. case ISD::CONCAT_VECTORS:
  19556. // Custom lowering has been requested for INSERT_SUBVECTOR and
  19557. // CONCAT_VECTORS -- but delegate to common code for result type
  19558. // legalisation
  19559. return;
  19560. case ISD::INTRINSIC_WO_CHAIN: {
  19561. EVT VT = N->getValueType(0);
  19562. assert((VT == MVT::i8 || VT == MVT::i16) &&
  19563. "custom lowering for unexpected type");
  19564. ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
  19565. Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
  19566. switch (IntID) {
  19567. default:
  19568. return;
  19569. case Intrinsic::aarch64_sve_clasta_n: {
  19570. SDLoc DL(N);
  19571. auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
  19572. auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
  19573. N->getOperand(1), Op2, N->getOperand(3));
  19574. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
  19575. return;
  19576. }
  19577. case Intrinsic::aarch64_sve_clastb_n: {
  19578. SDLoc DL(N);
  19579. auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
  19580. auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
  19581. N->getOperand(1), Op2, N->getOperand(3));
  19582. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
  19583. return;
  19584. }
  19585. case Intrinsic::aarch64_sve_lasta: {
  19586. SDLoc DL(N);
  19587. auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
  19588. N->getOperand(1), N->getOperand(2));
  19589. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
  19590. return;
  19591. }
  19592. case Intrinsic::aarch64_sve_lastb: {
  19593. SDLoc DL(N);
  19594. auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
  19595. N->getOperand(1), N->getOperand(2));
  19596. Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
  19597. return;
  19598. }
  19599. }
  19600. }
  19601. case ISD::READ_REGISTER: {
  19602. SDLoc DL(N);
  19603. assert(N->getValueType(0) == MVT::i128 &&
  19604. "READ_REGISTER custom lowering is only for 128-bit sysregs");
  19605. SDValue Chain = N->getOperand(0);
  19606. SDValue SysRegName = N->getOperand(1);
  19607. SDValue Result = DAG.getNode(
  19608. AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
  19609. Chain, SysRegName);
  19610. // Sysregs are not endian. Result.getValue(0) always contains the lower half
  19611. // of the 128-bit System Register value.
  19612. SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
  19613. Result.getValue(0), Result.getValue(1));
  19614. Results.push_back(Pair);
  19615. Results.push_back(Result.getValue(2)); // Chain
  19616. return;
  19617. }
  19618. }
  19619. }
  19620. bool AArch64TargetLowering::useLoadStackGuardNode() const {
  19621. if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
  19622. return TargetLowering::useLoadStackGuardNode();
  19623. return true;
  19624. }
  19625. unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
  19626. // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  19627. // reciprocal if there are three or more FDIVs.
  19628. return 3;
  19629. }
  19630. TargetLoweringBase::LegalizeTypeAction
  19631. AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  19632. // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  19633. // v4i16, v2i32 instead of to promote.
  19634. if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
  19635. VT == MVT::v1f32)
  19636. return TypeWidenVector;
  19637. return TargetLoweringBase::getPreferredVectorAction(VT);
  19638. }
  19639. // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
  19640. // provided the address is 16-byte aligned.
  19641. bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
  19642. if (!Subtarget->hasLSE2())
  19643. return false;
  19644. if (auto LI = dyn_cast<LoadInst>(I))
  19645. return LI->getType()->getPrimitiveSizeInBits() == 128 &&
  19646. LI->getAlign() >= Align(16);
  19647. if (auto SI = dyn_cast<StoreInst>(I))
  19648. return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
  19649. SI->getAlign() >= Align(16);
  19650. return false;
  19651. }
  19652. bool AArch64TargetLowering::shouldInsertFencesForAtomic(
  19653. const Instruction *I) const {
  19654. return isOpSuitableForLDPSTP(I);
  19655. }
  19656. bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
  19657. const Instruction *I) const {
  19658. // Store-Release instructions only provide seq_cst guarantees when paired with
  19659. // Load-Acquire instructions. MSVC CRT does not use these instructions to
  19660. // implement seq_cst loads and stores, so we need additional explicit fences
  19661. // after memory writes.
  19662. if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
  19663. return false;
  19664. switch (I->getOpcode()) {
  19665. default:
  19666. return false;
  19667. case Instruction::AtomicCmpXchg:
  19668. return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
  19669. AtomicOrdering::SequentiallyConsistent;
  19670. case Instruction::AtomicRMW:
  19671. return cast<AtomicRMWInst>(I)->getOrdering() ==
  19672. AtomicOrdering::SequentiallyConsistent;
  19673. case Instruction::Store:
  19674. return cast<StoreInst>(I)->getOrdering() ==
  19675. AtomicOrdering::SequentiallyConsistent;
  19676. }
  19677. }
  19678. // Loads and stores less than 128-bits are already atomic; ones above that
  19679. // are doomed anyway, so defer to the default libcall and blame the OS when
  19680. // things go wrong.
  19681. TargetLoweringBase::AtomicExpansionKind
  19682. AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  19683. unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  19684. if (Size != 128 || isOpSuitableForLDPSTP(SI))
  19685. return AtomicExpansionKind::None;
  19686. return AtomicExpansionKind::Expand;
  19687. }
  19688. // Loads and stores less than 128-bits are already atomic; ones above that
  19689. // are doomed anyway, so defer to the default libcall and blame the OS when
  19690. // things go wrong.
  19691. TargetLowering::AtomicExpansionKind
  19692. AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  19693. unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  19694. if (Size != 128 || isOpSuitableForLDPSTP(LI))
  19695. return AtomicExpansionKind::None;
  19696. // At -O0, fast-regalloc cannot cope with the live vregs necessary to
19697. // implement the atomic operation without spilling. If the target address is also on the
  19698. // stack and close enough to the spill slot, this can lead to a situation
  19699. // where the monitor always gets cleared and the atomic operation can never
  19700. // succeed. So at -O0 lower this operation to a CAS loop.
  19701. if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
  19702. return AtomicExpansionKind::CmpXChg;
  19703. // Using CAS for an atomic load has a better chance of succeeding under high
  19704. // contention situations. So use it if available.
  19705. return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
  19706. : AtomicExpansionKind::LLSC;
  19707. }
19708. // For the real atomic operations, we have ldxr/stxr up to 128 bits.
  19709. TargetLowering::AtomicExpansionKind
  19710. AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  19711. if (AI->isFloatingPointOperation())
  19712. return AtomicExpansionKind::CmpXChg;
  19713. unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  19714. if (Size > 128) return AtomicExpansionKind::None;
  19715. // Nand is not supported in LSE.
  19716. // Leave 128 bits to LLSC or CmpXChg.
  19717. if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
  19718. if (Subtarget->hasLSE())
  19719. return AtomicExpansionKind::None;
  19720. if (Subtarget->outlineAtomics()) {
19721. // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
  19722. // Don't outline them unless
  19723. // (1) high level <atomic> support approved:
  19724. // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
  19725. // (2) low level libgcc and compiler-rt support implemented by:
  19726. // min/max outline atomics helpers
  19727. if (AI->getOperation() != AtomicRMWInst::Min &&
  19728. AI->getOperation() != AtomicRMWInst::Max &&
  19729. AI->getOperation() != AtomicRMWInst::UMin &&
  19730. AI->getOperation() != AtomicRMWInst::UMax) {
  19731. return AtomicExpansionKind::None;
  19732. }
  19733. }
  19734. }
  19735. // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  19736. // implement atomicrmw without spilling. If the target address is also on the
  19737. // stack and close enough to the spill slot, this can lead to a situation
  19738. // where the monitor always gets cleared and the atomic operation can never
  19739. // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
  19740. // we have a single CAS instruction that can replace the loop.
  19741. if (getTargetMachine().getOptLevel() == CodeGenOpt::None ||
  19742. Subtarget->hasLSE())
  19743. return AtomicExpansionKind::CmpXChg;
  19744. return AtomicExpansionKind::LLSC;
  19745. }
  19746. TargetLowering::AtomicExpansionKind
  19747. AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
  19748. AtomicCmpXchgInst *AI) const {
  19749. // If subtarget has LSE, leave cmpxchg intact for codegen.
  19750. if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
  19751. return AtomicExpansionKind::None;
  19752. // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  19753. // implement cmpxchg without spilling. If the address being exchanged is also
  19754. // on the stack and close enough to the spill slot, this can lead to a
  19755. // situation where the monitor always gets cleared and the atomic operation
  19756. // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  19757. if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
  19758. return AtomicExpansionKind::None;
  19759. // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
  19760. // it.
  19761. unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
  19762. if (Size > 64)
  19763. return AtomicExpansionKind::None;
  19764. return AtomicExpansionKind::LLSC;
  19765. }
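// Emit the load-linked half of an LL/SC loop for AtomicExpand: LDXR/LDAXR for
// values up to 64 bits, or LDXP/LDAXP returning {i64, i64} for 128-bit values.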
  19766. Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
  19767. Type *ValueTy, Value *Addr,
  19768. AtomicOrdering Ord) const {
  19769. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  19770. bool IsAcquire = isAcquireOrStronger(Ord);
19771. // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
19772. // intrinsics must return {i64, i64} and we have to recombine the halves into a
19773. // single i128 here.
  19774. if (ValueTy->getPrimitiveSizeInBits() == 128) {
  19775. Intrinsic::ID Int =
  19776. IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
  19777. Function *Ldxr = Intrinsic::getDeclaration(M, Int);
  19778. Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
  19779. Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
  19780. Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  19781. Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  19782. Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
  19783. Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
  19784. return Builder.CreateOr(
  19785. Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
  19786. }
  19787. Type *Tys[] = { Addr->getType() };
  19788. Intrinsic::ID Int =
  19789. IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  19790. Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
  19791. const DataLayout &DL = M->getDataLayout();
  19792. IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
  19793. CallInst *CI = Builder.CreateCall(Ldxr, Addr);
  19794. CI->addParamAttr(
  19795. 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
  19796. Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
  19797. return Builder.CreateBitCast(Trunc, ValueTy);
  19798. }
  19799. void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
  19800. IRBuilderBase &Builder) const {
  19801. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  19802. Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
  19803. }
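// Emit the matching store-conditional: STXR/STLXR for values up to 64 bits, or
// STXP/STLXP for 128-bit values. The returned i32 status is 0 if the store
// succeeded.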
  19804. Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
  19805. Value *Val, Value *Addr,
  19806. AtomicOrdering Ord) const {
  19807. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  19808. bool IsRelease = isReleaseOrStronger(Ord);
  19809. // Since the intrinsics must have legal type, the i128 intrinsics take two
  19810. // parameters: "i64, i64". We must marshal Val into the appropriate form
  19811. // before the call.
  19812. if (Val->getType()->getPrimitiveSizeInBits() == 128) {
  19813. Intrinsic::ID Int =
  19814. IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
  19815. Function *Stxr = Intrinsic::getDeclaration(M, Int);
  19816. Type *Int64Ty = Type::getInt64Ty(M->getContext());
  19817. Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
  19818. Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
  19819. Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
  19820. return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
  19821. }
  19822. Intrinsic::ID Int =
  19823. IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
  19824. Type *Tys[] = { Addr->getType() };
  19825. Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
  19826. const DataLayout &DL = M->getDataLayout();
  19827. IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
  19828. Val = Builder.CreateBitCast(Val, IntValTy);
  19829. CallInst *CI = Builder.CreateCall(
  19830. Stxr, {Builder.CreateZExtOrBitCast(
  19831. Val, Stxr->getFunctionType()->getParamType(0)),
  19832. Addr});
  19833. CI->addParamAttr(1, Attribute::get(Builder.getContext(),
  19834. Attribute::ElementType, Val->getType()));
  19835. return CI;
  19836. }
  19837. bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
  19838. Type *Ty, CallingConv::ID CallConv, bool isVarArg,
  19839. const DataLayout &DL) const {
  19840. if (!Ty->isArrayTy()) {
  19841. const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
  19842. return TySize.isScalable() && TySize.getKnownMinValue() > 128;
  19843. }
19844. // All non-aggregate members of the type must have the same type.
  19845. SmallVector<EVT> ValueVTs;
  19846. ComputeValueVTs(*this, DL, Ty, ValueVTs);
  19847. return all_equal(ValueVTs);
  19848. }
  19849. bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
  19850. EVT) const {
  19851. return false;
  19852. }
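// Return an i8** pointing Offset bytes past the thread pointer; used to
// address the fixed Android/Fuchsia TLS slots referenced below.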
  19853. static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
  19854. Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  19855. Function *ThreadPointerFunc =
  19856. Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
  19857. return IRB.CreatePointerCast(
  19858. IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
  19859. Offset),
  19860. IRB.getInt8PtrTy()->getPointerTo(0));
  19861. }
  19862. Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  19863. // Android provides a fixed TLS slot for the stack cookie. See the definition
  19864. // of TLS_SLOT_STACK_GUARD in
  19865. // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  19866. if (Subtarget->isTargetAndroid())
  19867. return UseTlsOffset(IRB, 0x28);
  19868. // Fuchsia is similar.
  19869. // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  19870. if (Subtarget->isTargetFuchsia())
  19871. return UseTlsOffset(IRB, -0x10);
  19872. return TargetLowering::getIRStackGuard(IRB);
  19873. }
  19874. void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
19875. // The MSVC CRT provides functionality for stack protection.
  19876. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
  19877. // MSVC CRT has a global variable holding security cookie.
  19878. M.getOrInsertGlobal("__security_cookie",
  19879. Type::getInt8PtrTy(M.getContext()));
  19880. // MSVC CRT has a function to validate security cookie.
  19881. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
  19882. Subtarget->getSecurityCheckCookieName(),
  19883. Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()));
  19884. if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
  19885. F->setCallingConv(CallingConv::Win64);
  19886. F->addParamAttr(0, Attribute::AttrKind::InReg);
  19887. }
  19888. return;
  19889. }
  19890. TargetLowering::insertSSPDeclarations(M);
  19891. }
  19892. Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
  19893. // MSVC CRT has a global variable holding security cookie.
  19894. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
  19895. return M.getGlobalVariable("__security_cookie");
  19896. return TargetLowering::getSDagStackGuard(M);
  19897. }
  19898. Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  19899. // MSVC CRT has a function to validate security cookie.
  19900. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
  19901. return M.getFunction(Subtarget->getSecurityCheckCookieName());
  19902. return TargetLowering::getSSPStackGuardCheck(M);
  19903. }
  19904. Value *
  19905. AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  19906. // Android provides a fixed TLS slot for the SafeStack pointer. See the
  19907. // definition of TLS_SLOT_SAFESTACK in
  19908. // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  19909. if (Subtarget->isTargetAndroid())
  19910. return UseTlsOffset(IRB, 0x48);
  19911. // Fuchsia is similar.
  19912. // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  19913. if (Subtarget->isTargetFuchsia())
  19914. return UseTlsOffset(IRB, -0x8);
  19915. return TargetLowering::getSafeStackPointerLocation(IRB);
  19916. }
  19917. bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
  19918. const Instruction &AndI) const {
  19919. // Only sink 'and' mask to cmp use block if it is masking a single bit, since
19920. // the and/cmp/br is then likely to be folded into a single tbz instruction. It
  19921. // may be beneficial to sink in other cases, but we would have to check that
  19922. // the cmp would not get folded into the br to form a cbz for these to be
  19923. // beneficial.
  19924. ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  19925. if (!Mask)
  19926. return false;
  19927. return Mask->getValue().isPowerOf2();
  19928. }
  19929. bool AArch64TargetLowering::
  19930. shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
  19931. SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
  19932. unsigned OldShiftOpcode, unsigned NewShiftOpcode,
  19933. SelectionDAG &DAG) const {
  19934. // Does baseline recommend not to perform the fold by default?
  19935. if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
  19936. X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
  19937. return false;
  19938. // Else, if this is a vector shift, prefer 'shl'.
  19939. return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
  19940. }
  19941. TargetLowering::ShiftLegalizationStrategy
  19942. AArch64TargetLowering::preferredShiftLegalizationStrategy(
  19943. SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
  19944. if (DAG.getMachineFunction().getFunction().hasMinSize() &&
  19945. !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
  19946. return ShiftLegalizationStrategy::LowerToLibcall;
  19947. return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
  19948. ExpansionFactor);
  19949. }
  19950. void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
19951. // Update IsSplitCSR in AArch64FunctionInfo.
  19952. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  19953. AFI->setIsSplitCSR(true);
  19954. }
  19955. void AArch64TargetLowering::insertCopiesSplitCSR(
  19956. MachineBasicBlock *Entry,
  19957. const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  19958. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  19959. const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  19960. if (!IStart)
  19961. return;
  19962. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  19963. MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  19964. MachineBasicBlock::iterator MBBI = Entry->begin();
  19965. for (const MCPhysReg *I = IStart; *I; ++I) {
  19966. const TargetRegisterClass *RC = nullptr;
  19967. if (AArch64::GPR64RegClass.contains(*I))
  19968. RC = &AArch64::GPR64RegClass;
  19969. else if (AArch64::FPR64RegClass.contains(*I))
  19970. RC = &AArch64::FPR64RegClass;
  19971. else
  19972. llvm_unreachable("Unexpected register class in CSRsViaCopy!");
  19973. Register NewVR = MRI->createVirtualRegister(RC);
  19974. // Create copy from CSR to a virtual register.
  19975. // FIXME: this currently does not emit CFI pseudo-instructions, it works
  19976. // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
  19977. // nounwind. If we want to generalize this later, we may need to emit
  19978. // CFI pseudo-instructions.
  19979. assert(Entry->getParent()->getFunction().hasFnAttribute(
  19980. Attribute::NoUnwind) &&
  19981. "Function should be nounwind in insertCopiesSplitCSR!");
  19982. Entry->addLiveIn(*I);
  19983. BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
  19984. .addReg(*I);
  19985. // Insert the copy-back instructions right before the terminator.
  19986. for (auto *Exit : Exits)
  19987. BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
  19988. TII->get(TargetOpcode::COPY), *I)
  19989. .addReg(NewVR);
  19990. }
  19991. }
  19992. bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  19993. // Integer division on AArch64 is expensive. However, when aggressively
  19994. // optimizing for code size, we prefer to use a div instruction, as it is
  19995. // usually smaller than the alternative sequence.
  19996. // The exception to this is vector division. Since AArch64 doesn't have vector
  19997. // integer division, leaving the division as-is is a loss even in terms of
  19998. // size, because it will have to be scalarized, while the alternative code
  19999. // sequence can be performed in vector form.
  20000. bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
  20001. return OptSize && !VT.isVector();
  20002. }
  20003. bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  20004. // We want inc-of-add for scalars and sub-of-not for vectors.
  20005. return VT.isScalarInteger();
  20006. }
  20007. bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
  20008. EVT VT) const {
20009. // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
  20010. // legalize.
  20011. if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
  20012. return false;
  20013. return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
  20014. }
  20015. bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  20016. return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
  20017. }
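// Darwin and Windows use a simple char* va_list. The generic AAPCS64 va_list
// is a struct of three pointers plus two 32-bit offsets, hence the size
// computed below.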
  20018. unsigned
  20019. AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  20020. if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
  20021. return getPointerTy(DL).getSizeInBits();
  20022. return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
  20023. }
  20024. void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  20025. MachineFrameInfo &MFI = MF.getFrameInfo();
  20026. // If we have any vulnerable SVE stack objects then the stack protector
  20027. // needs to be placed at the top of the SVE stack area, as the SVE locals
  20028. // are placed above the other locals, so we allocate it as if it were a
  20029. // scalable vector.
  20030. // FIXME: It may be worthwhile having a specific interface for this rather
  20031. // than doing it here in finalizeLowering.
  20032. if (MFI.hasStackProtectorIndex()) {
  20033. for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
  20034. if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
  20035. MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
  20036. MFI.setStackID(MFI.getStackProtectorIndex(),
  20037. TargetStackID::ScalableVector);
  20038. MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
  20039. break;
  20040. }
  20041. }
  20042. }
  20043. MFI.computeMaxCallFrameSize(MF);
  20044. TargetLoweringBase::finalizeLowering(MF);
  20045. }
  20046. // Unlike X86, we let frame lowering assign offsets to all catch objects.
  20047. bool AArch64TargetLowering::needsFixedCatchObjects() const {
  20048. return false;
  20049. }
  20050. bool AArch64TargetLowering::shouldLocalize(
  20051. const MachineInstr &MI, const TargetTransformInfo *TTI) const {
  20052. auto &MF = *MI.getMF();
  20053. auto &MRI = MF.getRegInfo();
  20054. auto maxUses = [](unsigned RematCost) {
  20055. // A cost of 1 means remats are basically free.
  20056. if (RematCost == 1)
  20057. return std::numeric_limits<unsigned>::max();
  20058. if (RematCost == 2)
  20059. return 2U;
  20060. // Remat is too expensive, only sink if there's one user.
  20061. if (RematCost > 2)
  20062. return 1U;
  20063. llvm_unreachable("Unexpected remat cost");
  20064. };
  20065. switch (MI.getOpcode()) {
  20066. case TargetOpcode::G_GLOBAL_VALUE: {
  20067. // On Darwin, TLS global vars get selected into function calls, which
20068. // we don't want localized, as they can get moved into the middle of
  20069. // another call sequence.
  20070. const GlobalValue &GV = *MI.getOperand(1).getGlobal();
  20071. if (GV.isThreadLocal() && Subtarget->isTargetMachO())
  20072. return false;
  20073. break;
  20074. }
  20075. case TargetOpcode::G_CONSTANT: {
  20076. auto *CI = MI.getOperand(1).getCImm();
  20077. APInt Imm = CI->getValue();
  20078. InstructionCost Cost = TTI->getIntImmCost(
  20079. Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
  20080. assert(Cost.isValid() && "Expected a valid imm cost");
  20081. unsigned RematCost = *Cost.getValue();
  20082. Register Reg = MI.getOperand(0).getReg();
  20083. unsigned MaxUses = maxUses(RematCost);
20084. // Don't pass the UINT_MAX sentinel value to hasAtMostUserInstrs().
  20085. if (MaxUses == std::numeric_limits<unsigned>::max())
  20086. --MaxUses;
  20087. return MRI.hasAtMostUserInstrs(Reg, MaxUses);
  20088. }
  20089. // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
  20090. // localizable.
  20091. case AArch64::ADRP:
  20092. case AArch64::G_ADD_LOW:
  20093. return true;
  20094. default:
  20095. break;
  20096. }
  20097. return TargetLoweringBase::shouldLocalize(MI, TTI);
  20098. }
  20099. bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
  20100. if (isa<ScalableVectorType>(Inst.getType()))
  20101. return true;
  20102. for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
  20103. if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
  20104. return true;
  20105. if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
  20106. if (isa<ScalableVectorType>(AI->getAllocatedType()))
  20107. return true;
  20108. }
20109. // SME: fall back to SelectionDAG for calls that require a streaming-mode change or a lazy save.
  20110. if (auto *Base = dyn_cast<CallBase>(&Inst)) {
  20111. auto CallerAttrs = SMEAttrs(*Inst.getFunction());
  20112. auto CalleeAttrs = SMEAttrs(*Base);
  20113. if (CallerAttrs.requiresSMChange(CalleeAttrs,
  20114. /*BodyOverridesInterface=*/false) ||
  20115. CallerAttrs.requiresLazySave(CalleeAttrs))
  20116. return true;
  20117. }
  20118. return false;
  20119. }
  20120. // Return the largest legal scalable vector type that matches VT's element type.
  20121. static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
  20122. assert(VT.isFixedLengthVector() &&
  20123. DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
  20124. "Expected legal fixed length vector!");
  20125. switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  20126. default:
  20127. llvm_unreachable("unexpected element type for SVE container");
  20128. case MVT::i8:
  20129. return EVT(MVT::nxv16i8);
  20130. case MVT::i16:
  20131. return EVT(MVT::nxv8i16);
  20132. case MVT::i32:
  20133. return EVT(MVT::nxv4i32);
  20134. case MVT::i64:
  20135. return EVT(MVT::nxv2i64);
  20136. case MVT::f16:
  20137. return EVT(MVT::nxv8f16);
  20138. case MVT::f32:
  20139. return EVT(MVT::nxv4f32);
  20140. case MVT::f64:
  20141. return EVT(MVT::nxv2f64);
  20142. }
  20143. }
  20144. // Return a PTRUE with active lanes corresponding to the extent of VT.
  20145. static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
  20146. EVT VT) {
  20147. assert(VT.isFixedLengthVector() &&
  20148. DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
  20149. "Expected legal fixed length vector!");
  20150. std::optional<unsigned> PgPattern =
  20151. getSVEPredPatternFromNumElements(VT.getVectorNumElements());
  20152. assert(PgPattern && "Unexpected element count for SVE predicate");
  20153. // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
  20154. // AArch64SVEPredPattern::all, which can enable the use of unpredicated
  20155. // variants of instructions when available.
  20156. const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  20157. unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  20158. unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  20159. if (MaxSVESize && MinSVESize == MaxSVESize &&
  20160. MaxSVESize == VT.getSizeInBits())
  20161. PgPattern = AArch64SVEPredPattern::all;
  20162. MVT MaskVT;
  20163. switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  20164. default:
  20165. llvm_unreachable("unexpected element type for SVE predicate");
  20166. case MVT::i8:
  20167. MaskVT = MVT::nxv16i1;
  20168. break;
  20169. case MVT::i16:
  20170. case MVT::f16:
  20171. MaskVT = MVT::nxv8i1;
  20172. break;
  20173. case MVT::i32:
  20174. case MVT::f32:
  20175. MaskVT = MVT::nxv4i1;
  20176. break;
  20177. case MVT::i64:
  20178. case MVT::f64:
  20179. MaskVT = MVT::nxv2i1;
  20180. break;
  20181. }
  20182. return getPTrue(DAG, DL, MaskVT, *PgPattern);
  20183. }
  20184. static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
  20185. EVT VT) {
  20186. assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
  20187. "Expected legal scalable vector!");
  20188. auto PredTy = VT.changeVectorElementType(MVT::i1);
  20189. return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
  20190. }
  20191. static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
  20192. if (VT.isFixedLengthVector())
  20193. return getPredicateForFixedLengthVector(DAG, DL, VT);
  20194. return getPredicateForScalableVector(DAG, DL, VT);
  20195. }
  20196. // Grow V to consume an entire SVE register.
  20197. static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  20198. assert(VT.isScalableVector() &&
  20199. "Expected to convert into a scalable vector!");
  20200. assert(V.getValueType().isFixedLengthVector() &&
  20201. "Expected a fixed length vector operand!");
  20202. SDLoc DL(V);
  20203. SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  20204. return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
  20205. }
  20206. // Shrink V so it's just big enough to maintain a VT's worth of data.
  20207. static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  20208. assert(VT.isFixedLengthVector() &&
  20209. "Expected to convert into a fixed length vector!");
  20210. assert(V.getValueType().isScalableVector() &&
  20211. "Expected a scalable vector operand!");
  20212. SDLoc DL(V);
  20213. SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  20214. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
  20215. }
  20216. // Convert all fixed length vector loads larger than NEON to masked_loads.
  20217. SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
  20218. SDValue Op, SelectionDAG &DAG) const {
  20219. auto Load = cast<LoadSDNode>(Op);
  20220. SDLoc DL(Op);
  20221. EVT VT = Op.getValueType();
  20222. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  20223. EVT LoadVT = ContainerVT;
  20224. EVT MemVT = Load->getMemoryVT();
  20225. auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
  20226. if (VT.isFloatingPoint()) {
  20227. LoadVT = ContainerVT.changeTypeToInteger();
  20228. MemVT = MemVT.changeTypeToInteger();
  20229. }
  20230. SDValue NewLoad = DAG.getMaskedLoad(
  20231. LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
  20232. DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
  20233. Load->getAddressingMode(), Load->getExtensionType());
  20234. SDValue Result = NewLoad;
  20235. if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
  20236. EVT ExtendVT = ContainerVT.changeVectorElementType(
  20237. Load->getMemoryVT().getVectorElementType());
  20238. Result = getSVESafeBitCast(ExtendVT, Result, DAG);
  20239. Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
  20240. Pg, Result, DAG.getUNDEF(ContainerVT));
  20241. } else if (VT.isFloatingPoint()) {
  20242. Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
  20243. }
  20244. Result = convertFromScalableVector(DAG, VT, Result);
  20245. SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  20246. return DAG.getMergeValues(MergedValues, DL);
  20247. }
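// Convert a fixed length vector mask into an SVE predicate: an all-ones mask
// maps directly onto the PTRUE for VT, otherwise the mask is moved into an SVE
// container and compared against zero (SETNE) under that predicate.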
  20248. static SDValue convertFixedMaskToScalableVector(SDValue Mask,
  20249. SelectionDAG &DAG) {
  20250. SDLoc DL(Mask);
  20251. EVT InVT = Mask.getValueType();
  20252. EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  20253. auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
  20254. if (ISD::isBuildVectorAllOnes(Mask.getNode()))
  20255. return Pg;
  20256. auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
  20257. auto Op2 = DAG.getConstant(0, DL, ContainerVT);
  20258. return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
  20259. {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
  20260. }
20261. // Convert fixed length vector masked loads larger than NEON to SVE masked loads.
  20262. SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
  20263. SDValue Op, SelectionDAG &DAG) const {
  20264. auto Load = cast<MaskedLoadSDNode>(Op);
  20265. SDLoc DL(Op);
  20266. EVT VT = Op.getValueType();
  20267. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  20268. SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
  20269. SDValue PassThru;
  20270. bool IsPassThruZeroOrUndef = false;
  20271. if (Load->getPassThru()->isUndef()) {
  20272. PassThru = DAG.getUNDEF(ContainerVT);
  20273. IsPassThruZeroOrUndef = true;
  20274. } else {
  20275. if (ContainerVT.isInteger())
  20276. PassThru = DAG.getConstant(0, DL, ContainerVT);
  20277. else
  20278. PassThru = DAG.getConstantFP(0, DL, ContainerVT);
  20279. if (isZerosVector(Load->getPassThru().getNode()))
  20280. IsPassThruZeroOrUndef = true;
  20281. }
  20282. SDValue NewLoad = DAG.getMaskedLoad(
  20283. ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
  20284. Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
  20285. Load->getAddressingMode(), Load->getExtensionType());
  20286. SDValue Result = NewLoad;
  20287. if (!IsPassThruZeroOrUndef) {
  20288. SDValue OldPassThru =
  20289. convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
  20290. Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
  20291. }
  20292. Result = convertFromScalableVector(DAG, VT, Result);
  20293. SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  20294. return DAG.getMergeValues(MergedValues, DL);
  20295. }
  20296. // Convert all fixed length vector stores larger than NEON to masked_stores.
  20297. SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
  20298. SDValue Op, SelectionDAG &DAG) const {
  20299. auto Store = cast<StoreSDNode>(Op);
  20300. SDLoc DL(Op);
  20301. EVT VT = Store->getValue().getValueType();
  20302. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  20303. EVT MemVT = Store->getMemoryVT();
  20304. auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
  20305. auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  20306. if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
  20307. EVT TruncVT = ContainerVT.changeVectorElementType(
  20308. Store->getMemoryVT().getVectorElementType());
  20309. MemVT = MemVT.changeTypeToInteger();
  20310. NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
  20311. NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
  20312. DAG.getUNDEF(TruncVT));
  20313. NewValue =
  20314. getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  20315. } else if (VT.isFloatingPoint()) {
  20316. MemVT = MemVT.changeTypeToInteger();
  20317. NewValue =
  20318. getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  20319. }
  20320. return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
  20321. Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
  20322. Store->getMemOperand(), Store->getAddressingMode(),
  20323. Store->isTruncatingStore());
  20324. }
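// Convert fixed length vector masked stores larger than NEON to SVE masked
// stores.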
  20325. SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
  20326. SDValue Op, SelectionDAG &DAG) const {
  20327. auto *Store = cast<MaskedStoreSDNode>(Op);
  20328. SDLoc DL(Op);
  20329. EVT VT = Store->getValue().getValueType();
  20330. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  20331. auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  20332. SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
  20333. return DAG.getMaskedStore(
  20334. Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
  20335. Mask, Store->getMemoryVT(), Store->getMemOperand(),
  20336. Store->getAddressingMode(), Store->isTruncatingStore());
  20337. }
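// Lower fixed length SDIV/UDIV to predicated SVE divides. Signed divides by a
// power-of-two splat become a predicated shift (AArch64ISD::SRAD_MERGE_OP1),
// and i8/i16 element types are widened first because SVE only provides
// i32/i64 divides.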
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
    SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
    SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);

    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
                        DAG.getConstant(0, dl, ContainerVT), Res);

    return convertFromScalableVector(DAG, VT, Res);
  }

  // Scalable vector i32/i64 DIV is supported.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  // If the wider type is legal: extend, op, and truncate.
  EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
  if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
  }

  auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
                               &ExtendOpcode](SDValue Op) {
    SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
    SDValue IdxHalf =
        DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
    SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
    return std::pair<SDValue, SDValue>(
        {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
         DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
  };

  // If the wider type is not legal: split, extend, op, truncate and concat.
  auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
  auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));

  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
  SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
  SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
}
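// Lower a fixed-length SIGN_EXTEND/ZERO_EXTEND by repeatedly unpacking the
// low half of the scalable container with SUNPKLO/UUNPKLO.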
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;

  // Repeatedly unpack Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv16i8:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}
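// Lower a fixed-length TRUNCATE by repeatedly halving the element size with
// a bitcast followed by UZP1 until the desired element type is reached.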
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // Repeatedly truncate Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv2i64:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}
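// Lower EXTRACT_VECTOR_ELT on a fixed-length vector by extracting from the
// equivalent scalable container.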
SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT InVT = Op.getOperand(0).getValueType();
  assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
}
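// Lower INSERT_VECTOR_ELT on a fixed-length vector by inserting into the
// equivalent scalable container and converting the result back.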
SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
                                 Op.getOperand(1), Op.getOperand(2));
  return convertFromScalableVector(DAG, VT, ScalableRes);
}
// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  auto Pg = getPredicateForVector(DAG, DL, VT);

  if (VT.isFixedLengthVector()) {
    assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    // Create list of operands by converting existing ones to scalable types.
    SmallVector<SDValue, 4> Operands = {Pg};
    for (const SDValue &V : Op->op_values()) {
      if (isa<CondCodeSDNode>(V)) {
        Operands.push_back(V);
        continue;
      }

      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
        EVT VTArg = VTNode->getVT().getVectorElementType();
        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
        Operands.push_back(DAG.getValueType(NewVTArg));
        continue;
      }

      assert(isTypeLegal(V.getValueType()) &&
             "Expected only legal fixed-width types");
      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
    }

    if (isMergePassthruOpcode(NewOp))
      Operands.push_back(DAG.getUNDEF(ContainerVT));

    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
    return convertFromScalableVector(DAG, VT, ScalableRes);
  }

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  SmallVector<SDValue, 4> Operands = {Pg};
  for (const SDValue &V : Op->op_values()) {
    assert((!V.getValueType().isVector() ||
            V.getValueType().isScalableVector()) &&
           "Only scalable vectors are supported!");
    Operands.push_back(V);
  }

  if (isMergePassthruOpcode(NewOp))
    Operands.push_back(DAG.getUNDEF(VT));

  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
}
// If a fixed length vector operation has no side effects when applied to
// undefined elements, we can safely use scalable vectors to perform the same
// operation without needing to worry about predication.
SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
         "Only expected to lower fixed length vector operation!");
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  // Create list of operands by converting existing ones to scalable types.
  SmallVector<SDValue, 4> Ops;
  for (const SDValue &V : Op->op_values()) {
    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");

    // Pass through non-vector operands.
    if (!V.getValueType().isVector()) {
      Ops.push_back(V);
      continue;
    }

    // "cast" fixed length vector to a scalable vector.
    assert(V.getValueType().isFixedLengthVector() &&
           isTypeLegal(V.getValueType()) &&
           "Only fixed length vectors are supported!");
    Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
  }

  auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
  return convertFromScalableVector(DAG, VT, ScalableRes);
}
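// Lower VECREDUCE_SEQ_FADD to the strictly-ordered SVE FADDA reduction,
// seeding the scalar accumulator into lane 0 of the vector operand's
// container type.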
SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue AccOp = ScalarOp.getOperand(0);
  SDValue VecOp = ScalarOp.getOperand(1);
  EVT SrcVT = VecOp.getValueType();
  EVT ResVT = SrcVT.getVectorElementType();

  EVT ContainerVT = SrcVT;
  if (SrcVT.isFixedLengthVector()) {
    ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);

  // Convert operands to Scalable.
  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
                      DAG.getUNDEF(ContainerVT), AccOp, Zero);

  // Perform reduction.
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}
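// Lower reductions of scalable i1 vectors: VECREDUCE_OR/AND become PTEST
// checks, while VECREDUCE_XOR is derived from the active-lane count (CNTP).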
SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ReduceOp);
  SDValue Op = ReduceOp.getOperand(0);
  EVT OpVT = Op.getValueType();
  EVT VT = ReduceOp.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);

  switch (ReduceOp.getOpcode()) {
  default:
    return SDValue();
  case ISD::VECREDUCE_OR:
    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
      // The predicate can be 'Op' because
      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
    else
      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
  case ISD::VECREDUCE_AND: {
    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
  }
  case ISD::VECREDUCE_XOR: {
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    if (OpVT == MVT::nxv1i1) {
      // Emulate a CNTP on .Q using .D and a different governing predicate.
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
    }
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
  }

  return SDValue();
}
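// Lower a vector reduction to a predicated SVE reduction node, converting
// fixed-length operands to scalable containers where SVE is used for them.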
SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
                                                   SDValue ScalarOp,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue VecOp = ScalarOp.getOperand(0);
  EVT SrcVT = VecOp.getValueType();

  if (useSVEForFixedLengthVectorVT(
          SrcVT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
              SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                            Rdx, DAG.getConstant(0, DL, MVT::i64));

  // The VEC_REDUCE nodes expect an element-sized result.
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}
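// Lower a fixed-length VSELECT by converting the operands to scalable
// vectors and the mask to an SVE predicate.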
SDValue
AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
                                                         SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  EVT InVT = Op.getOperand(1).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));

  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
  EVT MaskVT = Op.getOperand(0).getValueType();
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
  Mask = DAG.getNode(ISD::TRUNCATE, DL,
                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);

  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                 Mask, Op1, Op2);

  return convertFromScalableVector(DAG, VT, ScalableRes);
}
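// Lower a fixed-length SETCC via SETCC_MERGE_ZERO, then widen the predicate
// result back to an integer vector of the expected width.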
SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
         "Only expected to lower fixed length vector operation!");
  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
         "Expected integer result of the same bit length as the inputs!");

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  EVT CmpVT = Pg.getValueType();
  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
                         {Pg, Op1, Op2, Op.getOperand(2)});

  EVT PromoteVT = ContainerVT.changeTypeToInteger();
  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}
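// Lower a fixed-length BITCAST by bitcasting between the corresponding
// scalable container types.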
SDValue
AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto SrcOp = Op.getOperand(0);
  EVT VT = Op.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT =
      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());

  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
  return convertFromScalableVector(DAG, VT, Op);
}
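// Lower CONCAT_VECTORS by pairing operands until only two remain, then
// joining them with an SVE SPLICE under a predicate sized for the sources.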
SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumOperands = Op->getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  auto SrcOp1 = Op.getOperand(0);
  auto SrcOp2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = SrcOp1.getValueType();

  if (NumOperands > 2) {
    SmallVector<SDValue, 4> Ops;
    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
    for (unsigned I = 0; I < NumOperands; I += 2)
      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
                                Op->getOperand(I), Op->getOperand(I + 1)));

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
  }

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);

  return convertFromScalableVector(DAG, VT, Op);
}
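// Lower a fixed-length FP_EXTEND by placing the narrow elements into wider
// lanes (via an integer any-extend) and converting with
// FP_EXTEND_MERGE_PASSTHRU.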
SDValue
AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  EVT SrcVT = Val.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ExtendVT = ContainerVT.changeVectorElementType(
      SrcVT.getVectorElementType());

  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);

  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                    Pg, Val, DAG.getUNDEF(ContainerVT));

  return convertFromScalableVector(DAG, VT, Val);
}
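// Lower a fixed-length FP_ROUND with FP_ROUND_MERGE_PASSTHRU, then narrow
// the result lanes with an integer truncate and bitcast to the final type.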
SDValue
AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
      VT.getVectorElementType());
  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);

  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
  Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
                    Op.getOperand(1), DAG.getUNDEF(RoundVT));
  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
}
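// Lower SINT_TO_FP/UINT_TO_FP using the merging SVE conversion nodes. When
// the result is at least as wide as the source, the integer is extended
// first; otherwise the conversion happens at the source width and the result
// is narrowed afterwards.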
SDValue
AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGE(SrcVT)) {
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                      VT.changeTypeToInteger(), Val);

    // It is safe to use a larger than specified operand because promoting the
    // value does not change it from an arithmetic point of view.
    Val =
        convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
        ContainerDstVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
    Val = convertFromScalableVector(DAG, SrcVT, Val);
    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
  }
}
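// Lower FP_TO_SINT/FP_TO_UINT using the merging FCVTZS/FCVTZU nodes. A wider
// result is produced by widening the source lanes first; a narrower one is
// converted at the source width and truncated afterwards.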
SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGT(SrcVT)) {
    EVT CvtVT = ContainerDstVT.changeVectorElementType(
        ContainerSrcVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);

    Val = convertToScalableVector(DAG, ContainerDstVT, Val);
    Val = getSVESafeBitCast(CvtVT, Val, DAG);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    // Safe to use a larger than specified result since an fp_to_int where the
    // result doesn't fit into the destination is undefined.
    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
  }
}
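// Lower a fixed-length VECTOR_SHUFFLE by matching the mask against SVE
// permutes (splat, INSR, REV*, ZIP, TRN, UZP). Masks that rely on an exact
// register size are only handled when the SVE vector length is known to
// match the fixed-length type.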
SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  auto ShuffleMask = SVN->getMask();

  SDLoc DL(Op);
  SDValue Op1 = Op.getOperand(0);
  SDValue Op2 = Op.getOperand(1);

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);

  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
      return MVT::i32;
    return ScalarTy;
  };

  if (SVN->isSplat()) {
    unsigned Lane = std::max(0, SVN->getSplatIndex());
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
                                  DAG.getConstant(Lane, DL, MVT::i64));
    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
    return convertFromScalableVector(DAG, VT, Op);
  }

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue Scalar = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, VT, LaneSize)) {
      EVT NewVT =
          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
      unsigned RevOp;
      unsigned EltSz = VT.getScalarSizeInBits();
      if (EltSz == 8)
        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
      else if (EltSz == 16)
        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
      else
        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;

      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
      Op = LowerToPredicatedOp(Op, DAG, RevOp);
      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
      return convertFromScalableVector(DAG, VT, Op);
    }
  }

  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
      isREVMask(ShuffleMask, VT, 128)) {
    if (!VT.isFloatingPoint())
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);

    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
    return convertFromScalableVector(DAG, VT, Op);
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));

  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));

  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
  }
  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction.
  // In isolation these functions do not mean the ISD::VECTOR_SHUFFLE is
  // exactly equivalent to an AArch64 instruction. There's the extra component
  // of ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these
  // functions only operated on 64/128-bit vector types that have a direct
  // mapping to a target register, and so an exact mapping is implied.
  // However, when using SVE for fixed length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. Index X is the last
  // vector element in the register) then such mappings are often only safe
  // when the exact SVE register size is known. The main exception to this is
  // when indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand, because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the
  // start of a fixed length vector is always the start of a scalable vector).
  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
      return convertFromScalableVector(DAG, VT, Op);
    }

    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));

    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
    }

    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));

    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }
  }

  return SDValue();
}
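// Bitcast between legal scalable vector types by packing the input into its
// packed container type, bitcasting, and unpacking the result if required.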
SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(VT.isScalableVector() && isTypeLegal(VT) &&
         InVT.isScalableVector() && isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable vector types!");
  assert(VT.getVectorElementType() != MVT::i1 &&
         InVT.getVectorElementType() != MVT::i1 &&
         "For predicate bitcasts, use getSVEPredicateBitCast");

  if (InVT == VT)
    return Op;

  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());

  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the following is missing the necessary
  // work to ensure the result's elements live where they're supposed to within
  // an SVE register.
  //                01234567
  // e.g. nxv2i32 = XX??XX??
  //      nxv4f16 = X?X?X?X?
  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
          VT == PackedVT || InVT == PackedInVT) &&
         "Unexpected bitcast!");

  // Pack input if required.
  if (InVT != PackedInVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);

  // Unpack result if required.
  if (VT != PackedVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  return Op;
}
bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
                                                 SDValue N) const {
  return ::isAllActivePredicate(DAG, N);
}

EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
  return ::getPromotedVTForPredicate(VT);
}
bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
    // Match (VSHL (VLSHR Val X) X)
    SDValue ShiftL = Op;
    SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      return false;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      return false;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      return false;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      return false;

    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
    // used - simplify to just Val.
    return TLO.CombineTo(Op, ShiftR->getOperand(0));
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
      if (!MaxSVEVectorSizeInBits)
        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
      // The SVE count intrinsics don't support the multiplier immediate so we
      // don't have to account for that here. The value returned may be
      // slightly over the true required bits, as this is based on the "ALL"
      // pattern. The other patterns are also exposed by these intrinsics, but
      // they all return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
    unsigned Opc, LLT Ty1, LLT Ty2) const {
  return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
}
bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!VTy)
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getNumElements();

  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
    return false;

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}
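// Emit NEON complex-arithmetic intrinsics (vcmla/vcadd) for a partial
// complex multiply or add, recursively splitting vectors wider than 128 bits
// in half.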
Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    Instruction *I, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());

  IRBuilder<> B(I);

  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 bits or a power of 2 that is at "
         "least 128 bits");

  if (TyWidth > 128) {
    int Stride = Ty->getNumElements() / 2;
    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
    auto SplitSeqVec = llvm::to_vector(SplitSeq);
    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);

    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;

    if (Accumulator) {
      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
    }

    auto *LowerSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    if (Accumulator == nullptr)
      Accumulator = ConstantFP::get(Ty, 0);

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputB, InputA});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  return nullptr;
}