
  1. //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the interfaces that NVPTX uses to lower LLVM code into a
  10. // selection DAG.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "NVPTXISelLowering.h"
  14. #include "MCTargetDesc/NVPTXBaseInfo.h"
  15. #include "NVPTX.h"
  16. #include "NVPTXSubtarget.h"
  17. #include "NVPTXTargetMachine.h"
  18. #include "NVPTXTargetObjectFile.h"
  19. #include "NVPTXUtilities.h"
  20. #include "llvm/ADT/APInt.h"
  21. #include "llvm/ADT/STLExtras.h"
  22. #include "llvm/ADT/SmallVector.h"
  23. #include "llvm/ADT/StringRef.h"
  24. #include "llvm/CodeGen/Analysis.h"
  25. #include "llvm/CodeGen/MachineFunction.h"
  26. #include "llvm/CodeGen/MachineMemOperand.h"
  27. #include "llvm/CodeGen/SelectionDAG.h"
  28. #include "llvm/CodeGen/SelectionDAGNodes.h"
  29. #include "llvm/CodeGen/TargetCallingConv.h"
  30. #include "llvm/CodeGen/TargetLowering.h"
  31. #include "llvm/CodeGen/ValueTypes.h"
  32. #include "llvm/IR/Argument.h"
  33. #include "llvm/IR/Attributes.h"
  34. #include "llvm/IR/Constants.h"
  35. #include "llvm/IR/DataLayout.h"
  36. #include "llvm/IR/DerivedTypes.h"
  37. #include "llvm/IR/FPEnv.h"
  38. #include "llvm/IR/Function.h"
  39. #include "llvm/IR/GlobalValue.h"
  40. #include "llvm/IR/Instruction.h"
  41. #include "llvm/IR/Instructions.h"
  42. #include "llvm/IR/IntrinsicsNVPTX.h"
  43. #include "llvm/IR/Module.h"
  44. #include "llvm/IR/Type.h"
  45. #include "llvm/IR/Value.h"
  46. #include "llvm/Support/Casting.h"
  47. #include "llvm/Support/CodeGen.h"
  48. #include "llvm/Support/CommandLine.h"
  49. #include "llvm/Support/ErrorHandling.h"
  50. #include "llvm/Support/MachineValueType.h"
  51. #include "llvm/Support/raw_ostream.h"
  52. #include "llvm/Target/TargetMachine.h"
  53. #include "llvm/Target/TargetOptions.h"
  54. #include <algorithm>
  #include <atomic> // for GlobalUniqueCallSite
  55. #include <cassert>
  56. #include <cmath>
  57. #include <cstdint>
  58. #include <iterator>
  59. #include <sstream>
  60. #include <string>
  61. #include <utility>
  62. #include <vector>
  63. #define DEBUG_TYPE "nvptx-lower"
  64. using namespace llvm;
  65. static std::atomic<unsigned> GlobalUniqueCallSite;
  66. static cl::opt<bool> sched4reg(
  67. "nvptx-sched4reg",
  68. cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
  69. static cl::opt<unsigned> FMAContractLevelOpt(
  70. "nvptx-fma-level", cl::Hidden,
  71. cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
  72. " 1: do it 2: do it aggressively"),
  73. cl::init(2));
  74. static cl::opt<int> UsePrecDivF32(
  75. "nvptx-prec-divf32", cl::Hidden,
  76. cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
  77. " IEEE Compliant F32 div.rnd if available."),
  78. cl::init(2));
  79. static cl::opt<bool> UsePrecSqrtF32(
  80. "nvptx-prec-sqrtf32", cl::Hidden,
  81. cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
  82. cl::init(true));
  83. int NVPTXTargetLowering::getDivF32Level() const {
  84. if (UsePrecDivF32.getNumOccurrences() > 0) {
  85. // If nvptx-prec-divf32=N is used on the command-line, always honor it
  86. return UsePrecDivF32;
  87. } else {
  88. // Otherwise, use div.approx if fast math is enabled
  89. if (getTargetMachine().Options.UnsafeFPMath)
  90. return 0;
  91. else
  92. return 2;
  93. }
  94. }
  95. bool NVPTXTargetLowering::usePrecSqrtF32() const {
  96. if (UsePrecSqrtF32.getNumOccurrences() > 0) {
  97. // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
  98. return UsePrecSqrtF32;
  99. } else {
  100. // Otherwise, use sqrt.approx if fast math is enabled
  101. return !getTargetMachine().Options.UnsafeFPMath;
  102. }
  103. }
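// Flush-to-zero for f32 is used when the function's single-precision denormal
// mode requests preserve-sign outputs (FTZ flushes denormals to a
// sign-preserving zero).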
  104. bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  105. return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
  106. DenormalMode::PreserveSign;
  107. }
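// Vector MVTs treated as native PTX vector types; loads/stores of these get
// custom handling in the constructor below.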
  108. static bool IsPTXVectorType(MVT VT) {
  109. switch (VT.SimpleTy) {
  110. default:
  111. return false;
  112. case MVT::v2i1:
  113. case MVT::v4i1:
  114. case MVT::v2i8:
  115. case MVT::v4i8:
  116. case MVT::v2i16:
  117. case MVT::v4i16:
  118. case MVT::v2i32:
  119. case MVT::v4i32:
  120. case MVT::v2i64:
  121. case MVT::v2f16:
  122. case MVT::v4f16:
  123. case MVT::v8f16: // <4 x f16x2>
  124. case MVT::v2bf16:
  125. case MVT::v4bf16:
  126. case MVT::v8bf16: // <4 x bf16x2>
  127. case MVT::v2f32:
  128. case MVT::v4f32:
  129. case MVT::v2f64:
  130. return true;
  131. }
  132. }
  133. /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
  134. /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
  135. /// into their primitive components.
  136. /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
  137. /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
  138. /// LowerCall, and LowerReturn.
  139. static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
  140. Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
  141. SmallVectorImpl<uint64_t> *Offsets = nullptr,
  142. uint64_t StartingOffset = 0) {
  143. SmallVector<EVT, 16> TempVTs;
  144. SmallVector<uint64_t, 16> TempOffsets;
  145. // Special case for i128 - decompose to (i64, i64)
  146. if (Ty->isIntegerTy(128)) {
  147. ValueVTs.push_back(EVT(MVT::i64));
  148. ValueVTs.push_back(EVT(MVT::i64));
  149. if (Offsets) {
  150. Offsets->push_back(StartingOffset + 0);
  151. Offsets->push_back(StartingOffset + 8);
  152. }
  153. return;
  154. }
  155. // For a struct type, recursively traverse its elements using this custom ComputePTXValueVTs.
  156. if (StructType *STy = dyn_cast<StructType>(Ty)) {
  157. auto const *SL = DL.getStructLayout(STy);
  158. auto ElementNum = 0;
  159. for (auto *EI : STy->elements()) {
  160. ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
  161. StartingOffset + SL->getElementOffset(ElementNum));
  162. ++ElementNum;
  163. }
  164. return;
  165. }
  166. ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  167. for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
  168. EVT VT = TempVTs[i];
  169. uint64_t Off = TempOffsets[i];
  170. // Split vectors into individual elements, except for v2f16, which
  171. // we will pass as a single scalar.
  172. if (VT.isVector()) {
  173. unsigned NumElts = VT.getVectorNumElements();
  174. EVT EltVT = VT.getVectorElementType();
  175. // Vectors with an even number of f16 elements will be passed to
  176. // us as an array of v2f16/v2bf16 elements. We must match this so we
  177. // stay in sync with Ins/Outs.
  178. if ((EltVT == MVT::f16 || EltVT == MVT::bf16) && NumElts % 2 == 0) {
  179. EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16;
  180. NumElts /= 2;
  181. }
  182. for (unsigned j = 0; j != NumElts; ++j) {
  183. ValueVTs.push_back(EltVT);
  184. if (Offsets)
  185. Offsets->push_back(Off + j * EltVT.getStoreSize());
  186. }
  187. } else {
  188. ValueVTs.push_back(VT);
  189. if (Offsets)
  190. Offsets->push_back(Off);
  191. }
  192. }
  193. }
  194. /// PromoteScalarIntegerPTX
  195. /// Used to make sure the arguments/returns are suitable for passing
  196. /// and promote them to a larger size if they're not.
  197. ///
  198. /// The promoted type is placed in \p PromotedVT if the function returns true.
  199. static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  200. if (VT.isScalarInteger()) {
  201. switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
  202. default:
  203. llvm_unreachable(
  204. "Promotion is not suitable for scalars of size larger than 64-bits");
  205. case 1:
  206. *PromotedVT = MVT::i1;
  207. break;
  208. case 2:
  209. case 4:
  210. case 8:
  211. *PromotedVT = MVT::i8;
  212. break;
  213. case 16:
  214. *PromotedVT = MVT::i16;
  215. break;
  216. case 32:
  217. *PromotedVT = MVT::i32;
  218. break;
  219. case 64:
  220. *PromotedVT = MVT::i64;
  221. break;
  222. }
  223. return EVT(*PromotedVT) != VT;
  224. }
  225. return false;
  226. }
  227. // Check whether we can merge loads/stores of some of the pieces of a
  228. // flattened function parameter or return value into a single vector
  229. // load/store.
  230. //
  231. // The flattened parameter is represented as a list of EVTs and
  232. // offsets, and the whole structure is aligned to ParamAlignment. This
  233. // function determines whether we can load/store pieces of the
  234. // parameter starting at index Idx using a single vectorized op of
  235. // size AccessSize. If so, it returns the number of param pieces
  236. // covered by the vector op. Otherwise, it returns 1.
  237. static unsigned CanMergeParamLoadStoresStartingAt(
  238. unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
  239. const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
  240. // Can't vectorize if param alignment is not sufficient.
  241. if (ParamAlignment < AccessSize)
  242. return 1;
  243. // Can't vectorize if offset is not aligned.
  244. if (Offsets[Idx] & (AccessSize - 1))
  245. return 1;
  246. EVT EltVT = ValueVTs[Idx];
  247. unsigned EltSize = EltVT.getStoreSize();
  248. // Element is too large to vectorize.
  249. if (EltSize >= AccessSize)
  250. return 1;
  251. unsigned NumElts = AccessSize / EltSize;
  252. // Can't vectorize if AccessSize is not a multiple of EltSize.
  253. if (AccessSize != EltSize * NumElts)
  254. return 1;
  255. // We don't have enough elements to vectorize.
  256. if (Idx + NumElts > ValueVTs.size())
  257. return 1;
  258. // PTX ISA can only deal with 2- and 4-element vector ops.
  259. if (NumElts != 4 && NumElts != 2)
  260. return 1;
  261. for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
  262. // Types do not match.
  263. if (ValueVTs[j] != EltVT)
  264. return 1;
  265. // Elements are not contiguous.
  266. if (Offsets[j] - Offsets[j - 1] != EltSize)
  267. return 1;
  268. }
  269. // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  270. return NumElts;
  271. }
  272. // Flags for tracking per-element vectorization state of loads/stores
  273. // of a flattened function parameter or return value.
  274. enum ParamVectorizationFlags {
  275. PVF_INNER = 0x0, // Middle elements of a vector.
  276. PVF_FIRST = 0x1, // First element of the vector.
  277. PVF_LAST = 0x2, // Last element of the vector.
  278. // Scalar is effectively a 1-element vector.
  279. PVF_SCALAR = PVF_FIRST | PVF_LAST
  280. };
  281. // Computes whether and how we can vectorize the loads/stores of a
  282. // flattened function parameter or return value.
  283. //
  284. // The flattened parameter is represented as the list of ValueVTs and
  285. // Offsets, and is aligned to ParamAlignment bytes. We return a vector
  286. // of the same size as ValueVTs indicating how each piece should be
  287. // loaded/stored (i.e. as a scalar, or as part of a vector
  288. // load/store).
  289. static SmallVector<ParamVectorizationFlags, 16>
  290. VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
  291. const SmallVectorImpl<uint64_t> &Offsets,
  292. Align ParamAlignment, bool IsVAArg = false) {
  293. // Set vector size to match ValueVTs and mark all elements as
  294. // scalars by default.
  295. SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  296. VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
  297. if (IsVAArg)
  298. return VectorInfo;
  299. // Check what we can vectorize using 128/64/32-bit accesses.
  300. for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
  301. // Skip elements we've already processed.
  302. assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
  303. for (unsigned AccessSize : {16, 8, 4, 2}) {
  304. unsigned NumElts = CanMergeParamLoadStoresStartingAt(
  305. I, AccessSize, ValueVTs, Offsets, ParamAlignment);
  306. // Mark vectorized elements.
  307. switch (NumElts) {
  308. default:
  309. llvm_unreachable("Unexpected return value");
  310. case 1:
  311. // Can't vectorize using this size, try next smaller size.
  312. continue;
  313. case 2:
  314. assert(I + 1 < E && "Not enough elements.");
  315. VectorInfo[I] = PVF_FIRST;
  316. VectorInfo[I + 1] = PVF_LAST;
  317. I += 1;
  318. break;
  319. case 4:
  320. assert(I + 3 < E && "Not enough elements.");
  321. VectorInfo[I] = PVF_FIRST;
  322. VectorInfo[I + 1] = PVF_INNER;
  323. VectorInfo[I + 2] = PVF_INNER;
  324. VectorInfo[I + 3] = PVF_LAST;
  325. I += 3;
  326. break;
  327. }
  328. // Break out of the inner loop because we've already succeeded
  329. // using the largest possible AccessSize.
  330. break;
  331. }
  332. }
  333. return VectorInfo;
  334. }
  335. // NVPTXTargetLowering Constructor.
  336. NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
  337. const NVPTXSubtarget &STI)
  338. : TargetLowering(TM), nvTM(&TM), STI(STI) {
  339. // Always lower memset, memcpy, and memmove intrinsics to load/store
  340. // instructions, rather than generating calls to memset, memcpy, or
  341. // memmove.
  342. MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  343. MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  344. MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
  345. setBooleanContents(ZeroOrNegativeOneBooleanContent);
  346. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  347. // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  348. // condition branches.
  349. setJumpIsExpensive(true);
  350. // Wide divides are _very_ slow. Try to reduce the width of the divide if
  351. // possible.
  352. addBypassSlowDiv(64, 32);
  353. // By default, use the Source scheduling
  354. if (sched4reg)
  355. setSchedulingPreference(Sched::RegPressure);
  356. else
  357. setSchedulingPreference(Sched::Source);
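// Helper: use Action for the given operation/type when the subtarget allows
// FP16 math, otherwise fall back to NoF16Action.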
  358. auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
  359. LegalizeAction NoF16Action) {
  360. setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  361. };
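// Register classes for the scalar and packed (f16x2/bf16x2) value types we
// keep in registers.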
  362. addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  363. addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  364. addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  365. addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  366. addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  367. addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  368. addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  369. addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
  370. addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
  371. addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);
  372. // Conversion to/from FP16/FP16x2 is always legal.
  373. setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  374. setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
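// v2f16 is legal as a register type, but building vectors and extracting
// elements needs custom lowering; insert and shuffle fall back to the
// generic expansion.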
  375. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  376. setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  377. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  378. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
  379. setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  380. setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
  381. // Operations not directly supported by NVPTX.
  382. for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
  383. MVT::i16, MVT::i32, MVT::i64}) {
  384. setOperationAction(ISD::SELECT_CC, VT, Expand);
  385. setOperationAction(ISD::BR_CC, VT, Expand);
  386. }
  387. // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  388. // For others we will expand to a SHL/SRA pair.
  389. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  390. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  391. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  392. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  393. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  394. setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
  395. setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
  396. setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
  397. setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
  398. setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
  399. setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
  400. setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  401. setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  402. // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  403. // that don't have h/w rotation we lower them to multi-instruction assembly.
  404. // See ROT*_sw in NVPTXIntrInfo.td
  405. setOperationAction(ISD::ROTL, MVT::i64, Legal);
  406. setOperationAction(ISD::ROTR, MVT::i64, Legal);
  407. setOperationAction(ISD::ROTL, MVT::i32, Legal);
  408. setOperationAction(ISD::ROTR, MVT::i32, Legal);
  409. setOperationAction(ISD::ROTL, MVT::i16, Expand);
  410. setOperationAction(ISD::ROTR, MVT::i16, Expand);
  411. setOperationAction(ISD::ROTL, MVT::i8, Expand);
  412. setOperationAction(ISD::ROTR, MVT::i8, Expand);
  413. setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  414. setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  415. setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  416. // Indirect branch is not supported.
  417. // This also disables Jump Table creation.
  418. setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  419. setOperationAction(ISD::BRIND, MVT::Other, Expand);
  420. setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  421. setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  422. // We want to legalize constant-related memmove and memcpy
  423. // intrinsics.
  424. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  425. // Turn FP extload into load/fpextend
  426. setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  427. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  428. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  429. setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  430. setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  431. setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  432. setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  433. setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  434. setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  435. // Turn FP truncstore into trunc + store.
  436. // FIXME: vector types should also be expanded
  437. setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  438. setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  439. setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  440. // PTX does not support load / store predicate registers
  441. setOperationAction(ISD::LOAD, MVT::i1, Custom);
  442. setOperationAction(ISD::STORE, MVT::i1, Custom);
  443. for (MVT VT : MVT::integer_valuetypes()) {
  444. setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  445. setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
  446. setTruncStoreAction(VT, MVT::i1, Expand);
  447. }
  448. // This is legal in NVPTX
  449. setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  450. setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  451. setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  452. setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
  453. // TRAP can be lowered to PTX trap
  454. setOperationAction(ISD::TRAP, MVT::Other, Legal);
  455. // Register custom handling for vector loads/stores
  456. for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
  457. if (IsPTXVectorType(VT)) {
  458. setOperationAction(ISD::LOAD, VT, Custom);
  459. setOperationAction(ISD::STORE, VT, Custom);
  460. setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
  461. }
  462. }
  463. // Support varargs.
  464. setOperationAction(ISD::VASTART, MVT::Other, Custom);
  465. setOperationAction(ISD::VAARG, MVT::Other, Custom);
  466. setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  467. setOperationAction(ISD::VAEND, MVT::Other, Expand);
  468. // Custom handling for i8 intrinsics
  469. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
  470. for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
  471. setOperationAction(ISD::ABS, Ty, Legal);
  472. setOperationAction(ISD::SMIN, Ty, Legal);
  473. setOperationAction(ISD::SMAX, Ty, Legal);
  474. setOperationAction(ISD::UMIN, Ty, Legal);
  475. setOperationAction(ISD::UMAX, Ty, Legal);
  476. setOperationAction(ISD::CTPOP, Ty, Legal);
  477. setOperationAction(ISD::CTLZ, Ty, Legal);
  478. }
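// Carry-propagating add/sub are always legal for i32; the i64 variants
// additionally require PTX 4.3+, as checked below.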
  479. setOperationAction(ISD::ADDC, MVT::i32, Legal);
  480. setOperationAction(ISD::ADDE, MVT::i32, Legal);
  481. setOperationAction(ISD::SUBC, MVT::i32, Legal);
  482. setOperationAction(ISD::SUBE, MVT::i32, Legal);
  483. if (STI.getPTXVersion() >= 43) {
  484. setOperationAction(ISD::ADDC, MVT::i64, Legal);
  485. setOperationAction(ISD::ADDE, MVT::i64, Legal);
  486. setOperationAction(ISD::SUBC, MVT::i64, Legal);
  487. setOperationAction(ISD::SUBE, MVT::i64, Legal);
  488. }
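// PTX has no native cttz instruction, so CTTZ is expanded by the generic
// legalizer.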
  489. setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  490. setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  491. setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  492. // PTX does not directly support SELP of i1, so promote to i32 first
  493. setOperationAction(ISD::SELECT, MVT::i1, Custom);
  494. // PTX cannot multiply two i64s in a single instruction.
  495. setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  496. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  497. // We have some custom DAG combine patterns for these nodes
  498. setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL,
  499. ISD::SREM, ISD::UREM});
  500. // setcc for f16x2 needs special handling to prevent the legalizer's
  501. // attempt to scalarize it, since v2i1 is not legal.
  502. if (STI.allowFP16Math())
  503. setTargetDAGCombine(ISD::SETCC);
  504. // Promote fp16 arithmetic if fp16 hardware isn't available or the
  505. // user passed --nvptx-no-fp16-math. The flag is useful because,
  506. // although sm_53+ GPUs have some sort of FP16 support in
  507. // hardware, only sm_53 and sm_60 have a full implementation. Others
  508. // only have a token amount of hardware and are likely to run faster
  509. // by using fp32 units instead.
  510. for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
  511. setFP16OperationAction(Op, MVT::f16, Legal, Promote);
  512. setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  513. }
  514. // f16/f16x2 neg was introduced in PTX 60, SM_53.
  515. const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
  516. STI.getPTXVersion() >= 60 &&
  517. STI.allowFP16Math();
  518. for (const auto &VT : {MVT::f16, MVT::v2f16})
  519. setOperationAction(ISD::FNEG, VT,
  520. IsFP16FP16x2NegAvailable ? Legal : Expand);
  521. // (would be) Library functions.
  522. // These map to conversion instructions for scalar FP types.
  523. for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
  524. ISD::FROUNDEVEN, ISD::FTRUNC}) {
  525. setOperationAction(Op, MVT::f16, Legal);
  526. setOperationAction(Op, MVT::f32, Legal);
  527. setOperationAction(Op, MVT::f64, Legal);
  528. setOperationAction(Op, MVT::v2f16, Expand);
  529. }
  530. setOperationAction(ISD::FROUND, MVT::f16, Promote);
  531. setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  532. setOperationAction(ISD::FROUND, MVT::f32, Custom);
  533. setOperationAction(ISD::FROUND, MVT::f64, Custom);
  534. // 'Expand' implements FCOPYSIGN without calling an external library.
  535. setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  536. setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  537. setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  538. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  539. // These map to corresponding instructions for f32/f64. f16 must be
  540. // promoted to f32. v2f16 is expanded to f16, which is then promoted
  541. // to f32.
  542. for (const auto &Op :
  543. {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
  544. setOperationAction(Op, MVT::f16, Promote);
  545. setOperationAction(Op, MVT::f32, Legal);
  546. setOperationAction(Op, MVT::f64, Legal);
  547. setOperationAction(Op, MVT::v2f16, Expand);
  548. }
  549. // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  550. auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
  551. bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
  552. return IsAtLeastSm80 ? Legal : NotSm80Action;
  553. };
  554. for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
  555. setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
  556. setOperationAction(Op, MVT::f32, Legal);
  557. setOperationAction(Op, MVT::f64, Legal);
  558. setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  559. }
  560. for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
  561. setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
  562. setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
  563. setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  564. }
  565. // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  566. // No FPOW or FREM in PTX.
  567. // Now deduce the register properties based on the above-mentioned
  568. // actions.
  569. computeRegisterProperties(STI.getRegisterInfo());
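// Widen atomic cmpxchg narrower than 32 bits to a 32-bit operation.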
  570. setMinCmpXchgSizeInBits(32);
  571. }
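// Map NVPTX-specific SelectionDAG opcodes to readable names for debug output.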
  572. const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  573. switch ((NVPTXISD::NodeType)Opcode) {
  574. case NVPTXISD::FIRST_NUMBER:
  575. break;
  576. case NVPTXISD::CALL:
  577. return "NVPTXISD::CALL";
  578. case NVPTXISD::RET_FLAG:
  579. return "NVPTXISD::RET_FLAG";
  580. case NVPTXISD::LOAD_PARAM:
  581. return "NVPTXISD::LOAD_PARAM";
  582. case NVPTXISD::Wrapper:
  583. return "NVPTXISD::Wrapper";
  584. case NVPTXISD::DeclareParam:
  585. return "NVPTXISD::DeclareParam";
  586. case NVPTXISD::DeclareScalarParam:
  587. return "NVPTXISD::DeclareScalarParam";
  588. case NVPTXISD::DeclareRet:
  589. return "NVPTXISD::DeclareRet";
  590. case NVPTXISD::DeclareScalarRet:
  591. return "NVPTXISD::DeclareScalarRet";
  592. case NVPTXISD::DeclareRetParam:
  593. return "NVPTXISD::DeclareRetParam";
  594. case NVPTXISD::PrintCall:
  595. return "NVPTXISD::PrintCall";
  596. case NVPTXISD::PrintConvergentCall:
  597. return "NVPTXISD::PrintConvergentCall";
  598. case NVPTXISD::PrintCallUni:
  599. return "NVPTXISD::PrintCallUni";
  600. case NVPTXISD::PrintConvergentCallUni:
  601. return "NVPTXISD::PrintConvergentCallUni";
  602. case NVPTXISD::LoadParam:
  603. return "NVPTXISD::LoadParam";
  604. case NVPTXISD::LoadParamV2:
  605. return "NVPTXISD::LoadParamV2";
  606. case NVPTXISD::LoadParamV4:
  607. return "NVPTXISD::LoadParamV4";
  608. case NVPTXISD::StoreParam:
  609. return "NVPTXISD::StoreParam";
  610. case NVPTXISD::StoreParamV2:
  611. return "NVPTXISD::StoreParamV2";
  612. case NVPTXISD::StoreParamV4:
  613. return "NVPTXISD::StoreParamV4";
  614. case NVPTXISD::StoreParamS32:
  615. return "NVPTXISD::StoreParamS32";
  616. case NVPTXISD::StoreParamU32:
  617. return "NVPTXISD::StoreParamU32";
  618. case NVPTXISD::CallArgBegin:
  619. return "NVPTXISD::CallArgBegin";
  620. case NVPTXISD::CallArg:
  621. return "NVPTXISD::CallArg";
  622. case NVPTXISD::LastCallArg:
  623. return "NVPTXISD::LastCallArg";
  624. case NVPTXISD::CallArgEnd:
  625. return "NVPTXISD::CallArgEnd";
  626. case NVPTXISD::CallVoid:
  627. return "NVPTXISD::CallVoid";
  628. case NVPTXISD::CallVal:
  629. return "NVPTXISD::CallVal";
  630. case NVPTXISD::CallSymbol:
  631. return "NVPTXISD::CallSymbol";
  632. case NVPTXISD::Prototype:
  633. return "NVPTXISD::Prototype";
  634. case NVPTXISD::MoveParam:
  635. return "NVPTXISD::MoveParam";
  636. case NVPTXISD::StoreRetval:
  637. return "NVPTXISD::StoreRetval";
  638. case NVPTXISD::StoreRetvalV2:
  639. return "NVPTXISD::StoreRetvalV2";
  640. case NVPTXISD::StoreRetvalV4:
  641. return "NVPTXISD::StoreRetvalV4";
  642. case NVPTXISD::PseudoUseParam:
  643. return "NVPTXISD::PseudoUseParam";
  644. case NVPTXISD::RETURN:
  645. return "NVPTXISD::RETURN";
  646. case NVPTXISD::CallSeqBegin:
  647. return "NVPTXISD::CallSeqBegin";
  648. case NVPTXISD::CallSeqEnd:
  649. return "NVPTXISD::CallSeqEnd";
  650. case NVPTXISD::CallPrototype:
  651. return "NVPTXISD::CallPrototype";
  652. case NVPTXISD::ProxyReg:
  653. return "NVPTXISD::ProxyReg";
  654. case NVPTXISD::LoadV2:
  655. return "NVPTXISD::LoadV2";
  656. case NVPTXISD::LoadV4:
  657. return "NVPTXISD::LoadV4";
  658. case NVPTXISD::LDGV2:
  659. return "NVPTXISD::LDGV2";
  660. case NVPTXISD::LDGV4:
  661. return "NVPTXISD::LDGV4";
  662. case NVPTXISD::LDUV2:
  663. return "NVPTXISD::LDUV2";
  664. case NVPTXISD::LDUV4:
  665. return "NVPTXISD::LDUV4";
  666. case NVPTXISD::StoreV2:
  667. return "NVPTXISD::StoreV2";
  668. case NVPTXISD::StoreV4:
  669. return "NVPTXISD::StoreV4";
  670. case NVPTXISD::FUN_SHFL_CLAMP:
  671. return "NVPTXISD::FUN_SHFL_CLAMP";
  672. case NVPTXISD::FUN_SHFR_CLAMP:
  673. return "NVPTXISD::FUN_SHFR_CLAMP";
  674. case NVPTXISD::IMAD:
  675. return "NVPTXISD::IMAD";
  676. case NVPTXISD::SETP_F16X2:
  677. return "NVPTXISD::SETP_F16X2";
  678. case NVPTXISD::Dummy:
  679. return "NVPTXISD::Dummy";
  680. case NVPTXISD::MUL_WIDE_SIGNED:
  681. return "NVPTXISD::MUL_WIDE_SIGNED";
  682. case NVPTXISD::MUL_WIDE_UNSIGNED:
  683. return "NVPTXISD::MUL_WIDE_UNSIGNED";
  684. case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
  685. case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
  686. case NVPTXISD::Tex1DFloatFloatLevel:
  687. return "NVPTXISD::Tex1DFloatFloatLevel";
  688. case NVPTXISD::Tex1DFloatFloatGrad:
  689. return "NVPTXISD::Tex1DFloatFloatGrad";
  690. case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
  691. case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
  692. case NVPTXISD::Tex1DS32FloatLevel:
  693. return "NVPTXISD::Tex1DS32FloatLevel";
  694. case NVPTXISD::Tex1DS32FloatGrad:
  695. return "NVPTXISD::Tex1DS32FloatGrad";
  696. case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
  697. case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
  698. case NVPTXISD::Tex1DU32FloatLevel:
  699. return "NVPTXISD::Tex1DU32FloatLevel";
  700. case NVPTXISD::Tex1DU32FloatGrad:
  701. return "NVPTXISD::Tex1DU32FloatGrad";
  702. case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
  703. case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  704. case NVPTXISD::Tex1DArrayFloatFloatLevel:
  705. return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  706. case NVPTXISD::Tex1DArrayFloatFloatGrad:
  707. return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  708. case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
  709. case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
  710. case NVPTXISD::Tex1DArrayS32FloatLevel:
  711. return "NVPTXISD::Tex1DArrayS32FloatLevel";
  712. case NVPTXISD::Tex1DArrayS32FloatGrad:
  713. return "NVPTXISD::Tex1DArrayS32FloatGrad";
  714. case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
  715. case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
  716. case NVPTXISD::Tex1DArrayU32FloatLevel:
  717. return "NVPTXISD::Tex1DArrayU32FloatLevel";
  718. case NVPTXISD::Tex1DArrayU32FloatGrad:
  719. return "NVPTXISD::Tex1DArrayU32FloatGrad";
  720. case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
  721. case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
  722. case NVPTXISD::Tex2DFloatFloatLevel:
  723. return "NVPTXISD::Tex2DFloatFloatLevel";
  724. case NVPTXISD::Tex2DFloatFloatGrad:
  725. return "NVPTXISD::Tex2DFloatFloatGrad";
  726. case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
  727. case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
  728. case NVPTXISD::Tex2DS32FloatLevel:
  729. return "NVPTXISD::Tex2DS32FloatLevel";
  730. case NVPTXISD::Tex2DS32FloatGrad:
  731. return "NVPTXISD::Tex2DS32FloatGrad";
  732. case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
  733. case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
  734. case NVPTXISD::Tex2DU32FloatLevel:
  735. return "NVPTXISD::Tex2DU32FloatLevel";
  736. case NVPTXISD::Tex2DU32FloatGrad:
  737. return "NVPTXISD::Tex2DU32FloatGrad";
  738. case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
  739. case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  740. case NVPTXISD::Tex2DArrayFloatFloatLevel:
  741. return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  742. case NVPTXISD::Tex2DArrayFloatFloatGrad:
  743. return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  744. case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
  745. case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
  746. case NVPTXISD::Tex2DArrayS32FloatLevel:
  747. return "NVPTXISD::Tex2DArrayS32FloatLevel";
  748. case NVPTXISD::Tex2DArrayS32FloatGrad:
  749. return "NVPTXISD::Tex2DArrayS32FloatGrad";
  750. case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
  751. case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
  752. case NVPTXISD::Tex2DArrayU32FloatLevel:
  753. return "NVPTXISD::Tex2DArrayU32FloatLevel";
  754. case NVPTXISD::Tex2DArrayU32FloatGrad:
  755. return "NVPTXISD::Tex2DArrayU32FloatGrad";
  756. case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
  757. case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
  758. case NVPTXISD::Tex3DFloatFloatLevel:
  759. return "NVPTXISD::Tex3DFloatFloatLevel";
  760. case NVPTXISD::Tex3DFloatFloatGrad:
  761. return "NVPTXISD::Tex3DFloatFloatGrad";
  762. case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
  763. case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
  764. case NVPTXISD::Tex3DS32FloatLevel:
  765. return "NVPTXISD::Tex3DS32FloatLevel";
  766. case NVPTXISD::Tex3DS32FloatGrad:
  767. return "NVPTXISD::Tex3DS32FloatGrad";
  768. case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
  769. case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
  770. case NVPTXISD::Tex3DU32FloatLevel:
  771. return "NVPTXISD::Tex3DU32FloatLevel";
  772. case NVPTXISD::Tex3DU32FloatGrad:
  773. return "NVPTXISD::Tex3DU32FloatGrad";
  774. case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
  775. case NVPTXISD::TexCubeFloatFloatLevel:
  776. return "NVPTXISD::TexCubeFloatFloatLevel";
  777. case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
  778. case NVPTXISD::TexCubeS32FloatLevel:
  779. return "NVPTXISD::TexCubeS32FloatLevel";
  780. case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
  781. case NVPTXISD::TexCubeU32FloatLevel:
  782. return "NVPTXISD::TexCubeU32FloatLevel";
  783. case NVPTXISD::TexCubeArrayFloatFloat:
  784. return "NVPTXISD::TexCubeArrayFloatFloat";
  785. case NVPTXISD::TexCubeArrayFloatFloatLevel:
  786. return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  787. case NVPTXISD::TexCubeArrayS32Float:
  788. return "NVPTXISD::TexCubeArrayS32Float";
  789. case NVPTXISD::TexCubeArrayS32FloatLevel:
  790. return "NVPTXISD::TexCubeArrayS32FloatLevel";
  791. case NVPTXISD::TexCubeArrayU32Float:
  792. return "NVPTXISD::TexCubeArrayU32Float";
  793. case NVPTXISD::TexCubeArrayU32FloatLevel:
  794. return "NVPTXISD::TexCubeArrayU32FloatLevel";
  795. case NVPTXISD::Tld4R2DFloatFloat:
  796. return "NVPTXISD::Tld4R2DFloatFloat";
  797. case NVPTXISD::Tld4G2DFloatFloat:
  798. return "NVPTXISD::Tld4G2DFloatFloat";
  799. case NVPTXISD::Tld4B2DFloatFloat:
  800. return "NVPTXISD::Tld4B2DFloatFloat";
  801. case NVPTXISD::Tld4A2DFloatFloat:
  802. return "NVPTXISD::Tld4A2DFloatFloat";
  803. case NVPTXISD::Tld4R2DS64Float:
  804. return "NVPTXISD::Tld4R2DS64Float";
  805. case NVPTXISD::Tld4G2DS64Float:
  806. return "NVPTXISD::Tld4G2DS64Float";
  807. case NVPTXISD::Tld4B2DS64Float:
  808. return "NVPTXISD::Tld4B2DS64Float";
  809. case NVPTXISD::Tld4A2DS64Float:
  810. return "NVPTXISD::Tld4A2DS64Float";
  811. case NVPTXISD::Tld4R2DU64Float:
  812. return "NVPTXISD::Tld4R2DU64Float";
  813. case NVPTXISD::Tld4G2DU64Float:
  814. return "NVPTXISD::Tld4G2DU64Float";
  815. case NVPTXISD::Tld4B2DU64Float:
  816. return "NVPTXISD::Tld4B2DU64Float";
  817. case NVPTXISD::Tld4A2DU64Float:
  818. return "NVPTXISD::Tld4A2DU64Float";
  819. case NVPTXISD::TexUnified1DFloatS32:
  820. return "NVPTXISD::TexUnified1DFloatS32";
  821. case NVPTXISD::TexUnified1DFloatFloat:
  822. return "NVPTXISD::TexUnified1DFloatFloat";
  823. case NVPTXISD::TexUnified1DFloatFloatLevel:
  824. return "NVPTXISD::TexUnified1DFloatFloatLevel";
  825. case NVPTXISD::TexUnified1DFloatFloatGrad:
  826. return "NVPTXISD::TexUnified1DFloatFloatGrad";
  827. case NVPTXISD::TexUnified1DS32S32:
  828. return "NVPTXISD::TexUnified1DS32S32";
  829. case NVPTXISD::TexUnified1DS32Float:
  830. return "NVPTXISD::TexUnified1DS32Float";
  831. case NVPTXISD::TexUnified1DS32FloatLevel:
  832. return "NVPTXISD::TexUnified1DS32FloatLevel";
  833. case NVPTXISD::TexUnified1DS32FloatGrad:
  834. return "NVPTXISD::TexUnified1DS32FloatGrad";
  835. case NVPTXISD::TexUnified1DU32S32:
  836. return "NVPTXISD::TexUnified1DU32S32";
  837. case NVPTXISD::TexUnified1DU32Float:
  838. return "NVPTXISD::TexUnified1DU32Float";
  839. case NVPTXISD::TexUnified1DU32FloatLevel:
  840. return "NVPTXISD::TexUnified1DU32FloatLevel";
  841. case NVPTXISD::TexUnified1DU32FloatGrad:
  842. return "NVPTXISD::TexUnified1DU32FloatGrad";
  843. case NVPTXISD::TexUnified1DArrayFloatS32:
  844. return "NVPTXISD::TexUnified1DArrayFloatS32";
  845. case NVPTXISD::TexUnified1DArrayFloatFloat:
  846. return "NVPTXISD::TexUnified1DArrayFloatFloat";
  847. case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  848. return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  849. case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  850. return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  851. case NVPTXISD::TexUnified1DArrayS32S32:
  852. return "NVPTXISD::TexUnified1DArrayS32S32";
  853. case NVPTXISD::TexUnified1DArrayS32Float:
  854. return "NVPTXISD::TexUnified1DArrayS32Float";
  855. case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  856. return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  857. case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  858. return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  859. case NVPTXISD::TexUnified1DArrayU32S32:
  860. return "NVPTXISD::TexUnified1DArrayU32S32";
  861. case NVPTXISD::TexUnified1DArrayU32Float:
  862. return "NVPTXISD::TexUnified1DArrayU32Float";
  863. case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  864. return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  865. case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  866. return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  867. case NVPTXISD::TexUnified2DFloatS32:
  868. return "NVPTXISD::TexUnified2DFloatS32";
  869. case NVPTXISD::TexUnified2DFloatFloat:
  870. return "NVPTXISD::TexUnified2DFloatFloat";
  871. case NVPTXISD::TexUnified2DFloatFloatLevel:
  872. return "NVPTXISD::TexUnified2DFloatFloatLevel";
  873. case NVPTXISD::TexUnified2DFloatFloatGrad:
  874. return "NVPTXISD::TexUnified2DFloatFloatGrad";
  875. case NVPTXISD::TexUnified2DS32S32:
  876. return "NVPTXISD::TexUnified2DS32S32";
  877. case NVPTXISD::TexUnified2DS32Float:
  878. return "NVPTXISD::TexUnified2DS32Float";
  879. case NVPTXISD::TexUnified2DS32FloatLevel:
  880. return "NVPTXISD::TexUnified2DS32FloatLevel";
  881. case NVPTXISD::TexUnified2DS32FloatGrad:
  882. return "NVPTXISD::TexUnified2DS32FloatGrad";
  883. case NVPTXISD::TexUnified2DU32S32:
  884. return "NVPTXISD::TexUnified2DU32S32";
  885. case NVPTXISD::TexUnified2DU32Float:
  886. return "NVPTXISD::TexUnified2DU32Float";
  887. case NVPTXISD::TexUnified2DU32FloatLevel:
  888. return "NVPTXISD::TexUnified2DU32FloatLevel";
  889. case NVPTXISD::TexUnified2DU32FloatGrad:
  890. return "NVPTXISD::TexUnified2DU32FloatGrad";
  891. case NVPTXISD::TexUnified2DArrayFloatS32:
  892. return "NVPTXISD::TexUnified2DArrayFloatS32";
  893. case NVPTXISD::TexUnified2DArrayFloatFloat:
  894. return "NVPTXISD::TexUnified2DArrayFloatFloat";
  895. case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  896. return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  897. case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  898. return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  899. case NVPTXISD::TexUnified2DArrayS32S32:
  900. return "NVPTXISD::TexUnified2DArrayS32S32";
  901. case NVPTXISD::TexUnified2DArrayS32Float:
  902. return "NVPTXISD::TexUnified2DArrayS32Float";
  903. case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  904. return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  905. case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  906. return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  907. case NVPTXISD::TexUnified2DArrayU32S32:
  908. return "NVPTXISD::TexUnified2DArrayU32S32";
  909. case NVPTXISD::TexUnified2DArrayU32Float:
  910. return "NVPTXISD::TexUnified2DArrayU32Float";
  911. case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  912. return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  913. case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  914. return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  915. case NVPTXISD::TexUnified3DFloatS32:
  916. return "NVPTXISD::TexUnified3DFloatS32";
  917. case NVPTXISD::TexUnified3DFloatFloat:
  918. return "NVPTXISD::TexUnified3DFloatFloat";
  919. case NVPTXISD::TexUnified3DFloatFloatLevel:
  920. return "NVPTXISD::TexUnified3DFloatFloatLevel";
  921. case NVPTXISD::TexUnified3DFloatFloatGrad:
  922. return "NVPTXISD::TexUnified3DFloatFloatGrad";
  923. case NVPTXISD::TexUnified3DS32S32:
  924. return "NVPTXISD::TexUnified3DS32S32";
  925. case NVPTXISD::TexUnified3DS32Float:
  926. return "NVPTXISD::TexUnified3DS32Float";
  927. case NVPTXISD::TexUnified3DS32FloatLevel:
  928. return "NVPTXISD::TexUnified3DS32FloatLevel";
  929. case NVPTXISD::TexUnified3DS32FloatGrad:
  930. return "NVPTXISD::TexUnified3DS32FloatGrad";
  931. case NVPTXISD::TexUnified3DU32S32:
  932. return "NVPTXISD::TexUnified3DU32S32";
  933. case NVPTXISD::TexUnified3DU32Float:
  934. return "NVPTXISD::TexUnified3DU32Float";
  935. case NVPTXISD::TexUnified3DU32FloatLevel:
  936. return "NVPTXISD::TexUnified3DU32FloatLevel";
  937. case NVPTXISD::TexUnified3DU32FloatGrad:
  938. return "NVPTXISD::TexUnified3DU32FloatGrad";
  939. case NVPTXISD::TexUnifiedCubeFloatFloat:
  940. return "NVPTXISD::TexUnifiedCubeFloatFloat";
  941. case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  942. return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  943. case NVPTXISD::TexUnifiedCubeS32Float:
  944. return "NVPTXISD::TexUnifiedCubeS32Float";
  945. case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  946. return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  947. case NVPTXISD::TexUnifiedCubeU32Float:
  948. return "NVPTXISD::TexUnifiedCubeU32Float";
  949. case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  950. return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  951. case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  952. return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  953. case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  954. return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  955. case NVPTXISD::TexUnifiedCubeArrayS32Float:
  956. return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  957. case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  958. return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  959. case NVPTXISD::TexUnifiedCubeArrayU32Float:
  960. return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  961. case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  962. return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  963. case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  964. return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  965. case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  966. return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  967. case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  968. return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  969. case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  970. return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  971. case NVPTXISD::Tld4UnifiedR2DS64Float:
  972. return "NVPTXISD::Tld4UnifiedR2DS64Float";
  973. case NVPTXISD::Tld4UnifiedG2DS64Float:
  974. return "NVPTXISD::Tld4UnifiedG2DS64Float";
  975. case NVPTXISD::Tld4UnifiedB2DS64Float:
  976. return "NVPTXISD::Tld4UnifiedB2DS64Float";
  977. case NVPTXISD::Tld4UnifiedA2DS64Float:
  978. return "NVPTXISD::Tld4UnifiedA2DS64Float";
  979. case NVPTXISD::Tld4UnifiedR2DU64Float:
  980. return "NVPTXISD::Tld4UnifiedR2DU64Float";
  981. case NVPTXISD::Tld4UnifiedG2DU64Float:
  982. return "NVPTXISD::Tld4UnifiedG2DU64Float";
  983. case NVPTXISD::Tld4UnifiedB2DU64Float:
  984. return "NVPTXISD::Tld4UnifiedB2DU64Float";
  985. case NVPTXISD::Tld4UnifiedA2DU64Float:
  986. return "NVPTXISD::Tld4UnifiedA2DU64Float";
  987. case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
  988. case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
  989. case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
  990. case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
  991. case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
  992. case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
  993. case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
  994. case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
  995. case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
  996. case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
  997. case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
  998. case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
  999. case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
  1000. case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
  1001. case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
  1002. case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  1003. case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  1004. case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  1005. case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  1006. case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  1007. case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  1008. case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
  1009. case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
  1010. case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
  1011. case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
  1012. case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
  1013. case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
  1014. case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
  1015. case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
  1016. case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
  1017. case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
  1018. case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
  1019. case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
  1020. case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
  1021. case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
  1022. case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
  1023. case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
  1024. case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  1025. case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  1026. case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  1027. case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  1028. case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  1029. case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  1030. case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
  1031. case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
  1032. case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
  1033. case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
  1034. case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
  1035. case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
  1036. case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
  1037. case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
  1038. case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
  1039. case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
  1040. case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
  1041. case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
  1042. case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
  1043. case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
  1044. case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
  1045. case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
  1046. case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
  1047. case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
  1048. case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
  1049. case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
  1050. case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
  1051. case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
  1052. case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
  1053. case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
  1054. case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
  1055. case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
  1056. case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
  1057. case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
  1058. case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
  1059. case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
  1060. case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
  1061. case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
  1062. case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
  1063. case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
  1064. case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
  1065. case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
  1066. case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
  1067. case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
  1068. case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
  1069. case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
  1070. case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
  1071. case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
  1072. case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
  1073. case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
  1074. case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
  1075. case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
  1076. case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
  1077. case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
  1078. case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
  1079. case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
  1080. case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
  1081. case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
  1082. case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
  1083. case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
  1084. case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
  1085. case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
  1086. case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
  1087. case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
  1088. case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
  1089. case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
  1090. case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
  1091. case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
  1092. case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
  1093. case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
  1094. case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
  1095. case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
  1096. case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
  1097. case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
  1098. case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
  1099. case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
  1100. case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
  1101. case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
  1102. case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
  1103. case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
  1104. case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
  1105. case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
  1106. case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
  1107. case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
  1108. case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
  1109. case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
  1110. case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
  1111. case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
  1112. case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
  1113. case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
  1114. case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
  1115. case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
  1116. case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
  1117. case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
  1118. case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
  1119. case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
  1120. case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
  1121. case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
  1122. case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
  1123. case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
  1124. case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
  1125. case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
  1126. case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
  1127. case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
  1128. case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
  1129. case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
  1130. case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
  1131. case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
  1132. case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
  1133. case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
  1134. case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
  1135. case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
  1136. case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
  1137. case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
  1138. case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
  1139. case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
  1140. case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
  1141. case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
  1142. case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
  1143. case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
  1144. case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
  1145. case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
  1146. case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
  1147. case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
  1148. case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
  1149. case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
  1150. case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
  1151. case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
  1152. }
  1153. return nullptr;
  1154. }
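// Choose how vector types are legalized for NVPTX: split multi-element i1
// vectors, keep v2f16 legal (it is handled natively as a packed 32-bit
// value), and defer everything else to the default handling.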
  1155. TargetLoweringBase::LegalizeTypeAction
  1156. NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  1157. if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
  1158. VT.getScalarType() == MVT::i1)
  1159. return TypeSplitVector;
  1160. if (VT == MVT::v2f16)
  1161. return TypeLegal;
  1162. return TargetLoweringBase::getPreferredVectorAction(VT);
  1163. }
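// Produce an approximate sqrt/rsqrt for fast-math lowering. Depending on
// whether refinement steps or a reciprocal result are requested, this emits
// the NVVM sqrt.approx or rsqrt.approx intrinsics (using the .ftz variants
// for f32 when flush-to-zero is in effect).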
  1164. SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
  1165. int Enabled, int &ExtraSteps,
  1166. bool &UseOneConst,
  1167. bool Reciprocal) const {
  1168. if (!(Enabled == ReciprocalEstimate::Enabled ||
  1169. (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
  1170. return SDValue();
  1171. if (ExtraSteps == ReciprocalEstimate::Unspecified)
  1172. ExtraSteps = 0;
  1173. SDLoc DL(Operand);
  1174. EVT VT = Operand.getValueType();
  1175. bool Ftz = useF32FTZ(DAG.getMachineFunction());
  1176. auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
  1177. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
  1178. DAG.getConstant(IID, DL, MVT::i32), Operand);
  1179. };
  1180. // The sqrt and rsqrt refinement processes assume we always start out with an
  1181. // approximation of the rsqrt. Therefore, if we're going to do any refinement
  1182. // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  1183. // any refinement, we must return a regular sqrt.
  1184. if (Reciprocal || ExtraSteps > 0) {
  1185. if (VT == MVT::f32)
  1186. return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
  1187. : Intrinsic::nvvm_rsqrt_approx_f);
  1188. else if (VT == MVT::f64)
  1189. return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
  1190. else
  1191. return SDValue();
  1192. } else {
  1193. if (VT == MVT::f32)
  1194. return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
  1195. : Intrinsic::nvvm_sqrt_approx_f);
  1196. else {
  1197. // There's no sqrt.approx.f64 instruction, so we emit
  1198. // reciprocal(rsqrt(x)). This is faster than
  1199. // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
  1200. // x * rsqrt(x).)
  1201. return DAG.getNode(
  1202. ISD::INTRINSIC_WO_CHAIN, DL, VT,
  1203. DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
  1204. MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
  1205. }
  1206. }
  1207. }
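// Wrap global addresses in NVPTXISD::Wrapper so they are emitted as target
// global addresses in the correct address space.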
  1208. SDValue
  1209. NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  1210. SDLoc dl(Op);
  1211. const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  1212. auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  1213. Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  1214. return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
  1215. }
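// Build the ".callprototype" string that PTX requires for indirect calls.
// The prototype describes the return value and each parameter as either a
// .param scalar or an aligned .b8 array, mirroring how LowerCall declares them.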
  1216. std::string NVPTXTargetLowering::getPrototype(
  1217. const DataLayout &DL, Type *retTy, const ArgListTy &Args,
  1218. const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
  1219. std::optional<std::pair<unsigned, const APInt &>> VAInfo,
  1220. const CallBase &CB, unsigned UniqueCallSite) const {
  1221. auto PtrVT = getPointerTy(DL);
  1222. bool isABI = (STI.getSmVersion() >= 20);
  1223. assert(isABI && "Non-ABI compilation is not supported");
  1224. if (!isABI)
  1225. return "";
  1226. std::string Prototype;
  1227. raw_string_ostream O(Prototype);
  1228. O << "prototype_" << UniqueCallSite << " : .callprototype ";
  1229. if (retTy->getTypeID() == Type::VoidTyID) {
  1230. O << "()";
  1231. } else {
  1232. O << "(";
  1233. if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
  1234. unsigned size = 0;
  1235. if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
  1236. size = ITy->getBitWidth();
  1237. } else {
  1238. assert(retTy->isFloatingPointTy() &&
  1239. "Floating point type expected here");
  1240. size = retTy->getPrimitiveSizeInBits();
  1241. }
  1242. // PTX ABI requires all scalar return values to be at least 32
  1243. // bits in size. fp16 normally uses .b16 as its storage type in
  1244. // PTX, so its size must be adjusted here, too.
  1245. size = promoteScalarArgumentSize(size);
  1246. O << ".param .b" << size << " _";
  1247. } else if (isa<PointerType>(retTy)) {
  1248. O << ".param .b" << PtrVT.getSizeInBits() << " _";
  1249. } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
  1250. retTy->isIntegerTy(128)) {
  1251. O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
  1252. << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
  1253. } else {
  1254. llvm_unreachable("Unknown return type");
  1255. }
  1256. O << ") ";
  1257. }
  1258. O << "_ (";
  1259. bool first = true;
  1260. const Function *F = CB.getFunction();
  1261. unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
  1262. for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
  1263. Type *Ty = Args[i].Ty;
  1264. if (!first) {
  1265. O << ", ";
  1266. }
  1267. first = false;
  1268. if (!Outs[OIdx].Flags.isByVal()) {
  1269. if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
  1270. unsigned ParamAlign = 0;
  1271. const CallInst *CallI = cast<CallInst>(&CB);
  1272. // +1 because index 0 is reserved for return type alignment
  1273. if (!getAlign(*CallI, i + 1, ParamAlign))
  1274. ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
  1275. O << ".param .align " << ParamAlign << " .b8 ";
  1276. O << "_";
  1277. O << "[" << DL.getTypeAllocSize(Ty) << "]";
  1278. // update the index for Outs
  1279. SmallVector<EVT, 16> vtparts;
  1280. ComputeValueVTs(*this, DL, Ty, vtparts);
  1281. if (unsigned len = vtparts.size())
  1282. OIdx += len - 1;
  1283. continue;
  1284. }
  1285. // i8 types in IR will be i16 types in SDAG
  1286. assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
  1287. (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
  1288. "type mismatch between callee prototype and arguments");
  1289. // scalar type
  1290. unsigned sz = 0;
  1291. if (isa<IntegerType>(Ty)) {
  1292. sz = cast<IntegerType>(Ty)->getBitWidth();
  1293. sz = promoteScalarArgumentSize(sz);
  1294. } else if (isa<PointerType>(Ty)) {
  1295. sz = PtrVT.getSizeInBits();
  1296. } else if (Ty->isHalfTy())
  1297. // PTX ABI requires all scalar parameters to be at least 32
  1298. // bits in size. fp16 normally uses .b16 as its storage type
  1299. // in PTX, so its size must be adjusted here, too.
  1300. sz = 32;
  1301. else
  1302. sz = Ty->getPrimitiveSizeInBits();
  1303. O << ".param .b" << sz << " ";
  1304. O << "_";
  1305. continue;
  1306. }
  1307. Type *ETy = Args[i].IndirectType;
  1308. Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
  1309. Align ParamByValAlign =
  1310. getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
  1311. O << ".param .align " << ParamByValAlign.value() << " .b8 ";
  1312. O << "_";
  1313. O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
  1314. }
  1315. if (VAInfo)
  1316. O << (first ? "" : ",") << " .param .align " << VAInfo->second
  1317. << " .b8 _[]\n";
  1318. O << ")";
  1319. if (shouldEmitPTXNoReturn(&CB, *nvTM))
  1320. O << " .noreturn";
  1321. O << ";";
  1322. return Prototype;
  1323. }
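// Compute the alignment to use for the argument or return value at index Idx
// of a call, preferring explicit align metadata on the call or callee and
// otherwise falling back to the optimized or ABI type alignment.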
  1324. Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
  1325. const CallBase *CB, Type *Ty,
  1326. unsigned Idx,
  1327. const DataLayout &DL) const {
  1328. if (!CB) {
1329. // The call site is null; fall back to the ABI type alignment.
  1330. return DL.getABITypeAlign(Ty);
  1331. }
  1332. unsigned Alignment = 0;
  1333. const Function *DirectCallee = CB->getCalledFunction();
  1334. if (!DirectCallee) {
1335. // We don't have a direct function symbol, but that may be because of
1336. // constant cast instructions in the call.
1337. // With bitcast'd call targets, check the call instruction itself for alignment metadata.
  1338. if (const auto *CI = dyn_cast<CallInst>(CB)) {
  1339. // Check if we have call alignment metadata
  1340. if (getAlign(*CI, Idx, Alignment))
  1341. return Align(Alignment);
  1342. }
  1343. DirectCallee = getMaybeBitcastedCallee(CB);
  1344. }
  1345. // Check for function alignment information if we found that the
  1346. // ultimate target is a Function
  1347. if (DirectCallee) {
  1348. if (getAlign(*DirectCallee, Idx, Alignment))
  1349. return Align(Alignment);
  1350. // If alignment information is not available, fall back to the
  1351. // default function param optimized type alignment
  1352. return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
  1353. }
  1354. // Call is indirect, fall back to the ABI type alignment
  1355. return DL.getABITypeAlign(Ty);
  1356. }
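// Lower a call on NVPTX: declare .param space for each argument and for the
// return value, store the argument values into param space, emit the call
// sequence, and load the results back out.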
  1357. SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  1358. SmallVectorImpl<SDValue> &InVals) const {
  1359. if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
  1360. report_fatal_error(
  1361. "Support for variadic functions (unsized array parameter) introduced "
  1362. "in PTX ISA version 6.0 and requires target sm_30.");
  1363. SelectionDAG &DAG = CLI.DAG;
  1364. SDLoc dl = CLI.DL;
  1365. SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  1366. SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  1367. SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  1368. SDValue Chain = CLI.Chain;
  1369. SDValue Callee = CLI.Callee;
  1370. bool &isTailCall = CLI.IsTailCall;
  1371. ArgListTy &Args = CLI.getArgs();
  1372. Type *RetTy = CLI.RetTy;
  1373. const CallBase *CB = CLI.CB;
  1374. const DataLayout &DL = DAG.getDataLayout();
  1375. bool isABI = (STI.getSmVersion() >= 20);
  1376. assert(isABI && "Non-ABI compilation is not supported");
  1377. if (!isABI)
  1378. return Chain;
  1379. // Variadic arguments.
  1380. //
  1381. // Normally, for each argument, we declare a param scalar or a param
  1382. // byte array in the .param space, and store the argument value to that
  1383. // param scalar or array starting at offset 0.
  1384. //
  1385. // In the case of the first variadic argument, we declare a vararg byte array
  1386. // with size 0. The exact size of this array isn't known at this point, so
  1387. // it'll be patched later. All the variadic arguments will be stored to this
  1388. // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
  1389. // initially set to 0, so it can be used for non-variadic arguments (which use
  1390. // 0 offset) to simplify the code.
  1391. //
1392. // After all variadic arguments are processed, 'VAOffset' holds the size of
1393. // the vararg byte array.
  1394. SDValue VADeclareParam; // vararg byte array
  1395. unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
  1396. unsigned VAOffset = 0; // current offset in the param array
  1397. unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
  1398. SDValue TempChain = Chain;
  1399. Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
  1400. SDValue InFlag = Chain.getValue(1);
  1401. unsigned ParamCount = 0;
  1402. // Args.size() and Outs.size() need not match.
  1403. // Outs.size() will be larger
  1404. // * if there is an aggregate argument with multiple fields (each field
  1405. // showing up separately in Outs)
  1406. // * if there is a vector argument with more than typical vector-length
  1407. // elements (generally if more than 4) where each vector element is
  1408. // individually present in Outs.
  1409. // So a different index should be used for indexing into Outs/OutVals.
  1410. // See similar issue in LowerFormalArguments.
  1411. unsigned OIdx = 0;
1412. // Declare the .param or .reg spaces needed to pass values
1413. // to the function.
  1414. for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
  1415. EVT VT = Outs[OIdx].VT;
  1416. Type *Ty = Args[i].Ty;
  1417. bool IsVAArg = (i >= CLI.NumFixedArgs);
  1418. bool IsByVal = Outs[OIdx].Flags.isByVal();
  1419. SmallVector<EVT, 16> VTs;
  1420. SmallVector<uint64_t, 16> Offsets;
  1421. assert((!IsByVal || Args[i].IndirectType) &&
  1422. "byval arg must have indirect type");
  1423. Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
  1424. ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
  1425. Align ArgAlign;
  1426. if (IsByVal) {
  1427. // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
  1428. // so we don't need to worry whether it's naturally aligned or not.
  1429. // See TargetLowering::LowerCallTo().
  1430. Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
  1431. ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
  1432. InitialAlign, DL);
  1433. if (IsVAArg)
  1434. VAOffset = alignTo(VAOffset, ArgAlign);
  1435. } else {
  1436. ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
  1437. }
  1438. unsigned TypeSize =
  1439. (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
  1440. SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1441. bool NeedAlign; // Does argument declaration specify alignment?
  1442. if (IsVAArg) {
  1443. if (ParamCount == FirstVAArg) {
  1444. SDValue DeclareParamOps[] = {
  1445. Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
  1446. DAG.getConstant(ParamCount, dl, MVT::i32),
  1447. DAG.getConstant(1, dl, MVT::i32), InFlag};
  1448. VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
  1449. DeclareParamVTs, DeclareParamOps);
  1450. }
  1451. NeedAlign = IsByVal || Ty->isAggregateType() || Ty->isVectorTy() ||
  1452. Ty->isIntegerTy(128);
  1453. } else if (IsByVal || Ty->isAggregateType() || Ty->isVectorTy() ||
  1454. Ty->isIntegerTy(128)) {
  1455. // declare .param .align <align> .b8 .param<n>[<size>];
  1456. SDValue DeclareParamOps[] = {
  1457. Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
  1458. DAG.getConstant(ParamCount, dl, MVT::i32),
  1459. DAG.getConstant(TypeSize, dl, MVT::i32), InFlag};
  1460. Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
  1461. DeclareParamOps);
  1462. NeedAlign = true;
  1463. } else {
  1464. // declare .param .b<size> .param<n>;
  1465. if (VT.isInteger() || VT.isFloatingPoint()) {
  1466. // PTX ABI requires integral types to be at least 32 bits in
  1467. // size. FP16 is loaded/stored using i16, so it's handled
  1468. // here as well.
  1469. TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
  1470. }
  1471. SDValue DeclareScalarParamOps[] = {
  1472. Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
  1473. DAG.getConstant(TypeSize * 8, dl, MVT::i32),
  1474. DAG.getConstant(0, dl, MVT::i32), InFlag};
  1475. Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
  1476. DeclareScalarParamOps);
  1477. NeedAlign = false;
  1478. }
  1479. InFlag = Chain.getValue(1);
  1480. // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
  1481. // than 32-bits are sign extended or zero extended, depending on
  1482. // whether they are signed or unsigned types. This case applies
  1483. // only to scalar parameters and not to aggregate values.
  1484. bool ExtendIntegerParam =
  1485. Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
  1486. auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
  1487. SmallVector<SDValue, 6> StoreOperands;
  1488. for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
  1489. EVT EltVT = VTs[j];
  1490. int CurOffset = Offsets[j];
  1491. MaybeAlign PartAlign;
  1492. if (NeedAlign)
  1493. PartAlign = commonAlignment(ArgAlign, CurOffset);
  1494. // New store.
  1495. if (VectorInfo[j] & PVF_FIRST) {
  1496. assert(StoreOperands.empty() && "Unfinished preceding store.");
  1497. StoreOperands.push_back(Chain);
  1498. StoreOperands.push_back(
  1499. DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
  1500. StoreOperands.push_back(DAG.getConstant(
  1501. IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
  1502. dl, MVT::i32));
  1503. }
  1504. SDValue StVal = OutVals[OIdx];
  1505. MVT PromotedVT;
  1506. if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
  1507. EltVT = EVT(PromotedVT);
  1508. }
  1509. if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
  1510. llvm::ISD::NodeType Ext =
  1511. Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  1512. StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
  1513. }
  1514. if (IsByVal) {
  1515. auto PtrVT = getPointerTy(DL);
  1516. SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
  1517. DAG.getConstant(CurOffset, dl, PtrVT));
  1518. StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
  1519. PartAlign);
  1520. } else if (ExtendIntegerParam) {
  1521. assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
  1522. // zext/sext to i32
  1523. StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
  1524. : ISD::ZERO_EXTEND,
  1525. dl, MVT::i32, StVal);
  1526. }
  1527. if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
  1528. // Use 16-bit registers for small stores as it's the
  1529. // smallest general purpose register size supported by NVPTX.
  1530. StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
  1531. }
  1532. // Record the value to store.
  1533. StoreOperands.push_back(StVal);
  1534. if (VectorInfo[j] & PVF_LAST) {
  1535. unsigned NumElts = StoreOperands.size() - 3;
  1536. NVPTXISD::NodeType Op;
  1537. switch (NumElts) {
  1538. case 1:
  1539. Op = NVPTXISD::StoreParam;
  1540. break;
  1541. case 2:
  1542. Op = NVPTXISD::StoreParamV2;
  1543. break;
  1544. case 4:
  1545. Op = NVPTXISD::StoreParamV4;
  1546. break;
  1547. default:
  1548. llvm_unreachable("Invalid vector info.");
  1549. }
  1550. StoreOperands.push_back(InFlag);
  1551. // Adjust type of the store op if we've extended the scalar
  1552. // return value.
  1553. EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
  1554. Chain = DAG.getMemIntrinsicNode(
  1555. Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
  1556. TheStoreType, MachinePointerInfo(), PartAlign,
  1557. MachineMemOperand::MOStore);
  1558. InFlag = Chain.getValue(1);
  1559. // Cleanup.
  1560. StoreOperands.clear();
  1561. // TODO: We may need to support vector types that can be passed
  1562. // as scalars in variadic arguments.
  1563. if (!IsByVal && IsVAArg) {
  1564. assert(NumElts == 1 &&
  1565. "Vectorization is expected to be disabled for variadics.");
  1566. VAOffset += DL.getTypeAllocSize(
  1567. TheStoreType.getTypeForEVT(*DAG.getContext()));
  1568. }
  1569. }
  1570. if (!IsByVal)
  1571. ++OIdx;
  1572. }
  1573. assert(StoreOperands.empty() && "Unfinished parameter store.");
  1574. if (!IsByVal && VTs.size() > 0)
  1575. --OIdx;
  1576. ++ParamCount;
  1577. if (IsByVal && IsVAArg)
  1578. VAOffset += TypeSize;
  1579. }
  1580. GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  1581. MaybeAlign retAlignment = std::nullopt;
  1582. // Handle Result
  1583. if (Ins.size() > 0) {
  1584. SmallVector<EVT, 16> resvtparts;
  1585. ComputeValueVTs(*this, DL, RetTy, resvtparts);
  1586. // Declare
  1587. // .param .align 16 .b8 retval0[<size-in-bytes>], or
  1588. // .param .b<size-in-bits> retval0
  1589. unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
  1590. // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
  1591. // these three types to match the logic in
  1592. // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
  1593. // Plus, this behavior is consistent with nvcc's.
  1594. if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
  1595. (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
  1596. resultsz = promoteScalarArgumentSize(resultsz);
  1597. SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1598. SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
  1599. DAG.getConstant(resultsz, dl, MVT::i32),
  1600. DAG.getConstant(0, dl, MVT::i32), InFlag };
  1601. Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
  1602. DeclareRetOps);
  1603. InFlag = Chain.getValue(1);
  1604. } else {
  1605. retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
  1606. assert(retAlignment && "retAlignment is guaranteed to be set");
  1607. SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1608. SDValue DeclareRetOps[] = {
  1609. Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
  1610. DAG.getConstant(resultsz / 8, dl, MVT::i32),
  1611. DAG.getConstant(0, dl, MVT::i32), InFlag};
  1612. Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
  1613. DeclareRetOps);
  1614. InFlag = Chain.getValue(1);
  1615. }
  1616. }
  1617. bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
  1618. // Set the size of the vararg param byte array if the callee is a variadic
  1619. // function and the variadic part is not empty.
  1620. if (HasVAArgs) {
  1621. SDValue DeclareParamOps[] = {
  1622. VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
  1623. VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
  1624. VADeclareParam.getOperand(4)};
  1625. DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
  1626. VADeclareParam->getVTList(), DeclareParamOps);
  1627. }
  1628. // Both indirect calls and libcalls have nullptr Func. In order to distinguish
  1629. // between them we must rely on the call site value which is valid for
  1630. // indirect calls but is always null for libcalls.
  1631. bool isIndirectCall = !Func && CB;
  1632. if (isa<ExternalSymbolSDNode>(Callee)) {
  1633. Function* CalleeFunc = nullptr;
  1634. // Try to find the callee in the current module.
  1635. Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
  1636. assert(CalleeFunc != nullptr && "Libcall callee must be set.");
  1637. // Set the "libcall callee" attribute to indicate that the function
  1638. // must always have a declaration.
  1639. CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  1640. }
  1641. if (isIndirectCall) {
1642. // This is the indirect function call case: PTX requires a prototype of the
1643. // form
1644. // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1645. // to be emitted, and the label has to be used as the last arg of the call
1646. // instruction.
  1647. // The prototype is embedded in a string and put as the operand for a
  1648. // CallPrototype SDNode which will print out to the value of the string.
  1649. SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1650. std::string Proto = getPrototype(
  1651. DL, RetTy, Args, Outs, retAlignment,
  1652. HasVAArgs
  1653. ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
  1654. CLI.NumFixedArgs,
  1655. cast<ConstantSDNode>(VADeclareParam->getOperand(1))
  1656. ->getAPIntValue()))
  1657. : std::nullopt,
  1658. *CB, UniqueCallSite);
  1659. const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
  1660. SDValue ProtoOps[] = {
  1661. Chain,
  1662. DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
  1663. InFlag,
  1664. };
  1665. Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
  1666. InFlag = Chain.getValue(1);
  1667. }
  1668. // Op to just print "call"
  1669. SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1670. SDValue PrintCallOps[] = {
  1671. Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
  1672. };
  1673. // We model convergent calls as separate opcodes.
  1674. unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  1675. if (CLI.IsConvergent)
  1676. Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
  1677. : NVPTXISD::PrintConvergentCall;
  1678. Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  1679. InFlag = Chain.getValue(1);
  1680. // Ops to print out the function name
  1681. SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1682. SDValue CallVoidOps[] = { Chain, Callee, InFlag };
  1683. Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  1684. InFlag = Chain.getValue(1);
  1685. // Ops to print out the param list
  1686. SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1687. SDValue CallArgBeginOps[] = { Chain, InFlag };
  1688. Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
  1689. CallArgBeginOps);
  1690. InFlag = Chain.getValue(1);
  1691. for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
  1692. ++i) {
  1693. unsigned opcode;
  1694. if (i == (e - 1))
  1695. opcode = NVPTXISD::LastCallArg;
  1696. else
  1697. opcode = NVPTXISD::CallArg;
  1698. SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1699. SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
  1700. DAG.getConstant(i, dl, MVT::i32), InFlag };
  1701. Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
  1702. InFlag = Chain.getValue(1);
  1703. }
  1704. SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1705. SDValue CallArgEndOps[] = { Chain,
  1706. DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
  1707. InFlag };
  1708. Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  1709. InFlag = Chain.getValue(1);
  1710. if (isIndirectCall) {
  1711. SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  1712. SDValue PrototypeOps[] = {
  1713. Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
  1714. Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
  1715. InFlag = Chain.getValue(1);
  1716. }
  1717. SmallVector<SDValue, 16> ProxyRegOps;
  1718. SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
  1719. // Generate loads from param memory/moves from registers for result
  1720. if (Ins.size() > 0) {
  1721. SmallVector<EVT, 16> VTs;
  1722. SmallVector<uint64_t, 16> Offsets;
  1723. ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
  1724. assert(VTs.size() == Ins.size() && "Bad value decomposition");
  1725. Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
  1726. auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
  1727. SmallVector<EVT, 6> LoadVTs;
  1728. int VecIdx = -1; // Index of the first element of the vector.
  1729. // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  1730. // 32-bits are sign extended or zero extended, depending on whether
  1731. // they are signed or unsigned types.
  1732. bool ExtendIntegerRetVal =
  1733. RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
  1734. for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
  1735. bool needTruncate = false;
  1736. EVT TheLoadType = VTs[i];
  1737. EVT EltType = Ins[i].VT;
  1738. Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
  1739. MVT PromotedVT;
  1740. if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
  1741. TheLoadType = EVT(PromotedVT);
  1742. EltType = EVT(PromotedVT);
  1743. needTruncate = true;
  1744. }
  1745. if (ExtendIntegerRetVal) {
  1746. TheLoadType = MVT::i32;
  1747. EltType = MVT::i32;
  1748. needTruncate = true;
  1749. } else if (TheLoadType.getSizeInBits() < 16) {
  1750. if (VTs[i].isInteger())
  1751. needTruncate = true;
  1752. EltType = MVT::i16;
  1753. }
  1754. // Record index of the very first element of the vector.
  1755. if (VectorInfo[i] & PVF_FIRST) {
  1756. assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
  1757. VecIdx = i;
  1758. }
  1759. LoadVTs.push_back(EltType);
  1760. if (VectorInfo[i] & PVF_LAST) {
  1761. unsigned NumElts = LoadVTs.size();
  1762. LoadVTs.push_back(MVT::Other);
  1763. LoadVTs.push_back(MVT::Glue);
  1764. NVPTXISD::NodeType Op;
  1765. switch (NumElts) {
  1766. case 1:
  1767. Op = NVPTXISD::LoadParam;
  1768. break;
  1769. case 2:
  1770. Op = NVPTXISD::LoadParamV2;
  1771. break;
  1772. case 4:
  1773. Op = NVPTXISD::LoadParamV4;
  1774. break;
  1775. default:
  1776. llvm_unreachable("Invalid vector info.");
  1777. }
  1778. SDValue LoadOperands[] = {
  1779. Chain, DAG.getConstant(1, dl, MVT::i32),
  1780. DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
  1781. SDValue RetVal = DAG.getMemIntrinsicNode(
  1782. Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
  1783. MachinePointerInfo(), EltAlign,
  1784. MachineMemOperand::MOLoad);
  1785. for (unsigned j = 0; j < NumElts; ++j) {
  1786. ProxyRegOps.push_back(RetVal.getValue(j));
  1787. if (needTruncate)
  1788. ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
  1789. else
  1790. ProxyRegTruncates.push_back(std::optional<MVT>());
  1791. }
  1792. Chain = RetVal.getValue(NumElts);
  1793. InFlag = RetVal.getValue(NumElts + 1);
  1794. // Cleanup
  1795. VecIdx = -1;
  1796. LoadVTs.clear();
  1797. }
  1798. }
  1799. }
  1800. Chain =
  1801. DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InFlag, dl);
  1802. InFlag = Chain.getValue(1);
  1803. // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1804. // will not get lost. Otherwise, during libcall expansion, the nodes can become
  1805. // dangling.
  1806. for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
  1807. SDValue Ret = DAG.getNode(
  1808. NVPTXISD::ProxyReg, dl,
  1809. DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
  1810. { Chain, ProxyRegOps[i], InFlag }
  1811. );
  1812. Chain = Ret.getValue(1);
  1813. InFlag = Ret.getValue(2);
  1814. if (ProxyRegTruncates[i]) {
  1815. Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
  1816. }
  1817. InVals.push_back(Ret);
  1818. }
  1819. // set isTailCall to false for now, until we figure out how to express
  1820. // tail call optimization in PTX
  1821. isTailCall = false;
  1822. return Chain;
  1823. }
1824. // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1825. // (see LegalizeDAG.cpp). This is slow and uses local memory.
1826. // We instead extract each element and rebuild the vector, as LegalizeOp() did in LLVM 2.5.
  1827. SDValue
  1828. NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  1829. SDNode *Node = Op.getNode();
  1830. SDLoc dl(Node);
  1831. SmallVector<SDValue, 8> Ops;
  1832. unsigned NumOperands = Node->getNumOperands();
  1833. for (unsigned i = 0; i < NumOperands; ++i) {
  1834. SDValue SubOp = Node->getOperand(i);
  1835. EVT VVT = SubOp.getNode()->getValueType(0);
  1836. EVT EltVT = VVT.getVectorElementType();
  1837. unsigned NumSubElem = VVT.getVectorNumElements();
  1838. for (unsigned j = 0; j < NumSubElem; ++j) {
  1839. Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
  1840. DAG.getIntPtrConstant(j, dl)));
  1841. }
  1842. }
  1843. return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
  1844. }
1845. // We can initialize a constant f16x2 with a single .b32 move. Normally it
1846. // would get lowered as two constant loads and a vector-packing move:
  1847. // mov.b16 %h1, 0x4000;
  1848. // mov.b16 %h2, 0x3C00;
  1849. // mov.b32 %hh2, {%h2, %h1};
  1850. // Instead we want just a constant move:
  1851. // mov.b32 %hh2, 0x40003C00
  1852. //
  1853. // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
  1854. // generates good SASS in both cases.
  1855. SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
  1856. SelectionDAG &DAG) const {
  1857. if (!(Op->getValueType(0) == MVT::v2f16 &&
  1858. isa<ConstantFPSDNode>(Op->getOperand(0)) &&
  1859. isa<ConstantFPSDNode>(Op->getOperand(1))))
  1860. return Op;
  1861. APInt E0 =
  1862. cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
  1863. APInt E1 =
  1864. cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
  1865. SDValue Const =
  1866. DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
  1867. return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
  1868. }
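// Lower extraction from v2f16 with a non-constant index: tablegen patterns
// handle constant indices, so for a variable index extract both halves and
// select between them.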
  1869. SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  1870. SelectionDAG &DAG) const {
  1871. SDValue Index = Op->getOperand(1);
  1872. // Constant index will be matched by tablegen.
  1873. if (isa<ConstantSDNode>(Index.getNode()))
  1874. return Op;
  1875. // Extract individual elements and select one of them.
  1876. SDValue Vector = Op->getOperand(0);
  1877. EVT VectorVT = Vector.getValueType();
  1878. assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
  1879. EVT EltVT = VectorVT.getVectorElementType();
  1880. SDLoc dl(Op.getNode());
  1881. SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
  1882. DAG.getIntPtrConstant(0, dl));
  1883. SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
  1884. DAG.getIntPtrConstant(1, dl));
  1885. return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
  1886. ISD::CondCode::SETEQ);
  1887. }
1888. /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
1889. /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
1890. /// amount, or
1891. /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
1892. /// amount.
  1893. SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
  1894. SelectionDAG &DAG) const {
  1895. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  1896. assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
  1897. EVT VT = Op.getValueType();
  1898. unsigned VTBits = VT.getSizeInBits();
  1899. SDLoc dl(Op);
  1900. SDValue ShOpLo = Op.getOperand(0);
  1901. SDValue ShOpHi = Op.getOperand(1);
  1902. SDValue ShAmt = Op.getOperand(2);
  1903. unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
  1904. if (VTBits == 32 && STI.getSmVersion() >= 35) {
  1905. // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
  1906. // {dHi, dLo} = {aHi, aLo} >> Amt
  1907. // dHi = aHi >> Amt
  1908. // dLo = shf.r.clamp aLo, aHi, Amt
  1909. SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  1910. SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
  1911. ShAmt);
  1912. SDValue Ops[2] = { Lo, Hi };
  1913. return DAG.getMergeValues(Ops, dl);
  1914. }
  1915. else {
  1916. // {dHi, dLo} = {aHi, aLo} >> Amt
  1917. // - if (Amt>=size) then
  1918. // dLo = aHi >> (Amt-size)
  1919. // dHi = aHi >> Amt (this is either all 0 or all 1)
  1920. // else
  1921. // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
  1922. // dHi = aHi >> Amt
  1923. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  1924. DAG.getConstant(VTBits, dl, MVT::i32),
  1925. ShAmt);
  1926. SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  1927. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  1928. DAG.getConstant(VTBits, dl, MVT::i32));
  1929. SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  1930. SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  1931. SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  1932. SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
  1933. DAG.getConstant(VTBits, dl, MVT::i32),
  1934. ISD::SETGE);
  1935. SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  1936. SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
  1937. SDValue Ops[2] = { Lo, Hi };
  1938. return DAG.getMergeValues(Ops, dl);
  1939. }
  1940. }
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
  1946. SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
  1947. SelectionDAG &DAG) const {
  1948. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  1949. assert(Op.getOpcode() == ISD::SHL_PARTS);
  1950. EVT VT = Op.getValueType();
  1951. unsigned VTBits = VT.getSizeInBits();
  1952. SDLoc dl(Op);
  1953. SDValue ShOpLo = Op.getOperand(0);
  1954. SDValue ShOpHi = Op.getOperand(1);
  1955. SDValue ShAmt = Op.getOperand(2);
  1956. if (VTBits == 32 && STI.getSmVersion() >= 35) {
  1957. // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
  1958. // {dHi, dLo} = {aHi, aLo} << Amt
  1959. // dHi = shf.l.clamp aLo, aHi, Amt
  1960. // dLo = aLo << Amt
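    // e.g. with 32-bit halves and Amt = 8:
    //   dHi = (aHi << 8) | (aLo >> 24), dLo = aLo << 8.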
  1961. SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
  1962. ShAmt);
  1963. SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  1964. SDValue Ops[2] = { Lo, Hi };
  1965. return DAG.getMergeValues(Ops, dl);
  1966. }
  1967. else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    //  - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //    else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
  1975. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  1976. DAG.getConstant(VTBits, dl, MVT::i32),
  1977. ShAmt);
  1978. SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  1979. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  1980. DAG.getConstant(VTBits, dl, MVT::i32));
  1981. SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  1982. SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  1983. SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  1984. SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
  1985. DAG.getConstant(VTBits, dl, MVT::i32),
  1986. ISD::SETGE);
  1987. SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  1988. SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
  1989. SDValue Ops[2] = { Lo, Hi };
  1990. return DAG.getMergeValues(Ops, dl);
  1991. }
  1992. }
  1993. SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  1994. EVT VT = Op.getValueType();
  1995. if (VT == MVT::f32)
  1996. return LowerFROUND32(Op, DAG);
  1997. if (VT == MVT::f64)
  1998. return LowerFROUND64(Op, DAG);
  1999. llvm_unreachable("unhandled type");
  2000. }
// This is the rounding method used in CUDA libdevice, in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
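// For example, roundf(-2.3f): A - 0.5f = -2.8f, truncation gives -2.0f, and
// since 0.5 <= |A| <= 2^23 that is the final result. Halfway cases round
// away from zero: roundf(2.5f) = 3.0f.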
  2008. SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
  2009. SelectionDAG &DAG) const {
  2010. SDLoc SL(Op);
  2011. SDValue A = Op.getOperand(0);
  2012. EVT VT = Op.getValueType();
  2013. SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
  2014. // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  2015. SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  2016. const int SignBitMask = 0x80000000;
  2017. SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
  2018. DAG.getConstant(SignBitMask, SL, MVT::i32));
  2019. const int PointFiveInBits = 0x3F000000;
  2020. SDValue PointFiveWithSignRaw =
  2021. DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
  2022. DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  2023. SDValue PointFiveWithSign =
  2024. DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  2025. SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  2026. SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
  2027. // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  2028. EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  2029. SDValue IsLarge =
  2030. DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
  2031. ISD::SETOGT);
  2032. RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
  2033. // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  2036. SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
  2037. return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
  2038. }
// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to each region to round the values. However, round(double) first
// rounds the absolute value and then restores the sign, while round(float)
// rounds the signed value directly.
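// For example, round(-2.5): |A| + 0.5 = 3.0, truncation gives 3.0, and
// copying the sign of A back yields -3.0.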
  2044. SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
  2045. SelectionDAG &DAG) const {
  2046. SDLoc SL(Op);
  2047. SDValue A = Op.getOperand(0);
  2048. EVT VT = Op.getValueType();
  2049. SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
  2050. // double RoundedA = (double) (int) (abs(A) + 0.5f);
  2051. SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
  2052. DAG.getConstantFP(0.5, SL, VT));
  2053. SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
  2054. // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
  2055. EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  2058. RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
  2059. DAG.getConstantFP(0, SL, VT),
  2060. RoundedA);
  2061. // Add sign to rounded_A
  2062. RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
  2064. // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
  2065. SDValue IsLarge =
  2066. DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
  2067. ISD::SETOGT);
  2068. return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
  2069. }
  2070. SDValue
  2071. NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  2072. switch (Op.getOpcode()) {
  2073. case ISD::RETURNADDR:
  2074. return SDValue();
  2075. case ISD::FRAMEADDR:
  2076. return SDValue();
  2077. case ISD::GlobalAddress:
  2078. return LowerGlobalAddress(Op, DAG);
  2079. case ISD::INTRINSIC_W_CHAIN:
  2080. return Op;
  2081. case ISD::BUILD_VECTOR:
  2082. return LowerBUILD_VECTOR(Op, DAG);
  2083. case ISD::EXTRACT_SUBVECTOR:
  2084. return Op;
  2085. case ISD::EXTRACT_VECTOR_ELT:
  2086. return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  2087. case ISD::CONCAT_VECTORS:
  2088. return LowerCONCAT_VECTORS(Op, DAG);
  2089. case ISD::STORE:
  2090. return LowerSTORE(Op, DAG);
  2091. case ISD::LOAD:
  2092. return LowerLOAD(Op, DAG);
  2093. case ISD::SHL_PARTS:
  2094. return LowerShiftLeftParts(Op, DAG);
  2095. case ISD::SRA_PARTS:
  2096. case ISD::SRL_PARTS:
  2097. return LowerShiftRightParts(Op, DAG);
  2098. case ISD::SELECT:
  2099. return LowerSelect(Op, DAG);
  2100. case ISD::FROUND:
  2101. return LowerFROUND(Op, DAG);
  2102. case ISD::VAARG:
  2103. return LowerVAARG(Op, DAG);
  2104. case ISD::VASTART:
  2105. return LowerVASTART(Op, DAG);
  2106. default:
  2107. llvm_unreachable("Custom lowering not defined for operation");
  2108. }
  2109. }
  2110. // This function is almost a copy of SelectionDAG::expandVAArg().
  2111. // The only diff is that this one produces loads from local address space.
  2112. SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  2113. const TargetLowering *TLI = STI.getTargetLowering();
  2114. SDLoc DL(Op);
  2115. SDNode *Node = Op.getNode();
  2116. const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  2117. EVT VT = Node->getValueType(0);
  2118. auto *Ty = VT.getTypeForEVT(*DAG.getContext());
  2119. SDValue Tmp1 = Node->getOperand(0);
  2120. SDValue Tmp2 = Node->getOperand(1);
  2121. const MaybeAlign MA(Node->getConstantOperandVal(3));
  2122. SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
  2123. Tmp1, Tmp2, MachinePointerInfo(V));
  2124. SDValue VAList = VAListLoad;
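  // If the argument needs more alignment than the minimum stack-argument
  // alignment, round the pointer up: VAList = (VAList + Align - 1) & -Align.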
  2125. if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
  2126. VAList = DAG.getNode(
  2127. ISD::ADD, DL, VAList.getValueType(), VAList,
  2128. DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
  2129. VAList = DAG.getNode(
  2130. ISD::AND, DL, VAList.getValueType(), VAList,
  2131. DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
  2132. }
  2133. // Increment the pointer, VAList, to the next vaarg
  2134. Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
  2135. DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
  2136. DL, VAList.getValueType()));
  2137. // Store the incremented VAList to the legalized pointer
  2138. Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
  2139. MachinePointerInfo(V));
  2140. const Value *SrcV =
  2141. Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
  2142. // Load the actual argument out of the pointer VAList
  2143. return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
  2144. }
  2145. SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  2146. const TargetLowering *TLI = STI.getTargetLowering();
  2147. SDLoc DL(Op);
  2148. EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
  2149. // Store the address of unsized array <function>_vararg[] in the ap object.
  2150. SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
  2151. SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
  2152. const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  2153. return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
  2154. MachinePointerInfo(SV));
  2155. }
  2156. SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
  2157. SDValue Op0 = Op->getOperand(0);
  2158. SDValue Op1 = Op->getOperand(1);
  2159. SDValue Op2 = Op->getOperand(2);
  2160. SDLoc DL(Op.getNode());
  2161. assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
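  // PTX has no selp for predicate (i1) values, so widen the operands to i32,
  // do the select there, and truncate the result back to i1.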
  2162. Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
  2163. Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  2164. SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
  2165. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
  2166. return Trunc;
  2167. }
  2168. SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  2169. if (Op.getValueType() == MVT::i1)
  2170. return LowerLOADi1(Op, DAG);
  2171. // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  2172. // loads and have to handle it here.
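  // If the alignment check below fails, expandUnalignedLoad splits the access
  // into narrower loads and returns the rebuilt value plus the new chain.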
  2173. if (Op.getValueType() == MVT::v2f16) {
  2174. LoadSDNode *Load = cast<LoadSDNode>(Op);
  2175. EVT MemVT = Load->getMemoryVT();
  2176. if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
  2177. MemVT, *Load->getMemOperand())) {
  2178. SDValue Ops[2];
  2179. std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
  2180. return DAG.getMergeValues(Ops, SDLoc(Op));
  2181. }
  2182. }
  2183. return SDValue();
  2184. }
  2185. // v = ld i1* addr
  2186. // =>
  2187. // v1 = ld i8* addr (-> i16)
  2188. // v = trunc i16 to i1
  2189. SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  2190. SDNode *Node = Op.getNode();
  2191. LoadSDNode *LD = cast<LoadSDNode>(Node);
  2192. SDLoc dl(Node);
  2193. assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  2194. assert(Node->getValueType(0) == MVT::i1 &&
  2195. "Custom lowering for i1 load only");
  2196. SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
  2197. LD->getPointerInfo(), LD->getAlign(),
  2198. LD->getMemOperand()->getFlags());
  2199. SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  2200. // The legalizer (the caller) is expecting two values from the legalized
  2201. // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  2202. // in LegalizeDAG.cpp which also uses MergeValues.
  2203. SDValue Ops[] = { result, LD->getChain() };
  2204. return DAG.getMergeValues(Ops, dl);
  2205. }
  2206. SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  2207. StoreSDNode *Store = cast<StoreSDNode>(Op);
  2208. EVT VT = Store->getMemoryVT();
  2209. if (VT == MVT::i1)
  2210. return LowerSTOREi1(Op, DAG);
  2211. // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  2212. // stores and have to handle it here.
  2213. if (VT == MVT::v2f16 &&
  2214. !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
  2215. VT, *Store->getMemOperand()))
  2216. return expandUnalignedStore(Store, DAG);
  2217. if (VT.isVector())
  2218. return LowerSTOREVector(Op, DAG);
  2219. return SDValue();
  2220. }
  2221. SDValue
  2222. NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  2223. SDNode *N = Op.getNode();
  2224. SDValue Val = N->getOperand(1);
  2225. SDLoc DL(N);
  2226. EVT ValVT = Val.getValueType();
  2227. if (ValVT.isVector()) {
  2228. // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  2229. // legal. We can (and should) split that into 2 stores of <2 x double> here
  2230. // but I'm leaving that as a TODO for now.
  2231. if (!ValVT.isSimple())
  2232. return SDValue();
  2233. switch (ValVT.getSimpleVT().SimpleTy) {
  2234. default:
  2235. return SDValue();
  2236. case MVT::v2i8:
  2237. case MVT::v2i16:
  2238. case MVT::v2i32:
  2239. case MVT::v2i64:
  2240. case MVT::v2f16:
  2241. case MVT::v2bf16:
  2242. case MVT::v2f32:
  2243. case MVT::v2f64:
  2244. case MVT::v4i8:
  2245. case MVT::v4i16:
  2246. case MVT::v4i32:
  2247. case MVT::v4f16:
  2248. case MVT::v4bf16:
  2249. case MVT::v4f32:
  2250. case MVT::v8f16: // <4 x f16x2>
  2251. case MVT::v8bf16: // <4 x bf16x2>
  2252. // This is a "native" vector type
  2253. break;
  2254. }
  2255. MemSDNode *MemSD = cast<MemSDNode>(N);
  2256. const DataLayout &TD = DAG.getDataLayout();
  2257. Align Alignment = MemSD->getAlign();
  2258. Align PrefAlign =
  2259. TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
  2260. if (Alignment < PrefAlign) {
  2261. // This store is not sufficiently aligned, so bail out and let this vector
  2262. // store be scalarized. Note that we may still be able to emit smaller
  2263. // vector stores. For example, if we are storing a <4 x float> with an
  2264. // alignment of 8, this check will fail but the legalizer will try again
  2265. // with 2 x <2 x float>, which will succeed with an alignment of 8.
  2266. return SDValue();
  2267. }
  2268. unsigned Opcode = 0;
  2269. EVT EltVT = ValVT.getVectorElementType();
  2270. unsigned NumElts = ValVT.getVectorNumElements();
  2271. // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
  2272. // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  2273. // stored type to i16 and propagate the "real" type as the memory type.
  2274. bool NeedExt = false;
  2275. if (EltVT.getSizeInBits() < 16)
  2276. NeedExt = true;
  2277. bool StoreF16x2 = false;
  2278. switch (NumElts) {
  2279. default:
  2280. return SDValue();
  2281. case 2:
  2282. Opcode = NVPTXISD::StoreV2;
  2283. break;
  2284. case 4:
  2285. Opcode = NVPTXISD::StoreV4;
  2286. break;
  2287. case 8:
      // v8f16 and v8bf16 are special cases. PTX doesn't have an st.v8 form
      // for 16-bit floats. Instead, we split the vector into 2-element
      // chunks and store them with st.v4.b32.
  2291. assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
  2292. "Wrong type for the vector.");
  2293. Opcode = NVPTXISD::StoreV4;
  2294. StoreF16x2 = true;
  2295. break;
  2296. }
  2297. SmallVector<SDValue, 8> Ops;
  2298. // First is the chain
  2299. Ops.push_back(N->getOperand(0));
  2300. if (StoreF16x2) {
      // Combine two consecutive 16-bit float elements into one 2-element
      // vector of the original element type (f16 -> v2f16, bf16 -> v2bf16).
      NumElts /= 2;
      EVT PairVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                 DAG.getIntPtrConstant(i * 2, DL));
        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, PairVT, E0, E1);
  2309. Ops.push_back(V2);
  2310. }
  2311. } else {
  2312. // Then the split values
  2313. for (unsigned i = 0; i < NumElts; ++i) {
  2314. SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
  2315. DAG.getIntPtrConstant(i, DL));
  2316. if (NeedExt)
  2317. ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
  2318. Ops.push_back(ExtVal);
  2319. }
  2320. }
  2321. // Then any remaining arguments
  2322. Ops.append(N->op_begin() + 2, N->op_end());
  2323. SDValue NewSt =
  2324. DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
  2325. MemSD->getMemoryVT(), MemSD->getMemOperand());
  2326. // return DCI.CombineTo(N, NewSt, true);
  2327. return NewSt;
  2328. }
  2329. return SDValue();
  2330. }
  2331. // st i1 v, addr
  2332. // =>
  2333. // v1 = zxt v to i16
  2334. // st.u8 i16, addr
  2335. SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  2336. SDNode *Node = Op.getNode();
  2337. SDLoc dl(Node);
  2338. StoreSDNode *ST = cast<StoreSDNode>(Node);
  2339. SDValue Tmp1 = ST->getChain();
  2340. SDValue Tmp2 = ST->getBasePtr();
  2341. SDValue Tmp3 = ST->getValue();
  2342. assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
  2343. Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
  2344. SDValue Result =
  2345. DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
  2346. ST->getAlign(), ST->getMemOperand()->getFlags());
  2347. return Result;
  2348. }
// This creates a target external symbol for a function parameter.
// The name of the symbol is composed from the parameter's index and the
// function name. A negative index corresponds to the special parameter
// (unsized array) used for passing variable arguments.
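// For example, parameter 1 of function "foo" becomes the symbol
// "foo_param_1", and the vararg array becomes "foo_vararg".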
  2353. SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
  2354. EVT v) const {
  2355. std::string ParamSym;
  2356. raw_string_ostream ParamStr(ParamSym);
  2357. ParamStr << DAG.getMachineFunction().getName();
  2358. if (idx < 0)
  2359. ParamStr << "_vararg";
  2360. else
  2361. ParamStr << "_param_" << idx;
  2362. StringRef SavedStr =
  2363. nvTM->getStrPool().save(ParamSym);
  2364. return DAG.getTargetExternalSymbol(SavedStr.data(), v);
  2365. }
  2366. SDValue NVPTXTargetLowering::LowerFormalArguments(
  2367. SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
  2368. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
  2369. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  2370. MachineFunction &MF = DAG.getMachineFunction();
  2371. const DataLayout &DL = DAG.getDataLayout();
  2372. auto PtrVT = getPointerTy(DAG.getDataLayout());
  2373. const Function *F = &MF.getFunction();
  2374. const AttributeList &PAL = F->getAttributes();
  2375. const TargetLowering *TLI = STI.getTargetLowering();
  2376. SDValue Root = DAG.getRoot();
  2377. std::vector<SDValue> OutChains;
  2378. bool isABI = (STI.getSmVersion() >= 20);
  2379. assert(isABI && "Non-ABI compilation is not supported");
  2380. if (!isABI)
  2381. return Chain;
  2382. std::vector<Type *> argTypes;
  2383. std::vector<const Argument *> theArgs;
  2384. for (const Argument &I : F->args()) {
  2385. theArgs.push_back(&I);
  2386. argTypes.push_back(I.getType());
  2387. }
  2388. // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  2389. // Ins.size() will be larger
  2390. // * if there is an aggregate argument with multiple fields (each field
  2391. // showing up separately in Ins)
  2392. // * if there is a vector argument with more than typical vector-length
  2393. // elements (generally if more than 4) where each vector element is
  2394. // individually present in Ins.
  2395. // So a different index should be used for indexing into Ins.
  2396. // See similar issue in LowerCall.
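  // For example, an argument of type {i32, float} is one entry in theArgs
  // and argTypes but two entries in Ins, so InsIdx advances independently
  // of i.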
  2397. unsigned InsIdx = 0;
  2398. int idx = 0;
  2399. for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
  2400. Type *Ty = argTypes[i];
  2401. if (theArgs[i]->use_empty()) {
  2402. // argument is dead
  2403. if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
  2404. SmallVector<EVT, 16> vtparts;
  2405. ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
  2406. assert(vtparts.size() > 0 && "empty aggregate type not expected");
  2407. for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
  2408. ++parti) {
  2409. InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
  2410. ++InsIdx;
  2411. }
  2412. if (vtparts.size() > 0)
  2413. --InsIdx;
  2414. continue;
  2415. }
  2416. if (Ty->isVectorTy()) {
  2417. EVT ObjectVT = getValueType(DL, Ty);
  2418. unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
  2419. for (unsigned parti = 0; parti < NumRegs; ++parti) {
  2420. InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
  2421. ++InsIdx;
  2422. }
  2423. if (NumRegs > 0)
  2424. --InsIdx;
  2425. continue;
  2426. }
  2427. InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
  2428. continue;
  2429. }
  2430. // In the following cases, assign a node order of "idx+1"
  2431. // to newly created nodes. The SDNodes for params have to
  2432. // appear in the same order as their order of appearance
  2433. // in the original function. "idx+1" holds that order.
  2434. if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
  2435. bool aggregateIsPacked = false;
  2436. if (StructType *STy = dyn_cast<StructType>(Ty))
  2437. aggregateIsPacked = STy->isPacked();
  2438. SmallVector<EVT, 16> VTs;
  2439. SmallVector<uint64_t, 16> Offsets;
  2440. ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
  2441. assert(VTs.size() > 0 && "Unexpected empty type.");
  2442. auto VectorInfo =
  2443. VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
  2444. SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
  2445. int VecIdx = -1; // Index of the first element of the current vector.
  2446. for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
  2447. if (VectorInfo[parti] & PVF_FIRST) {
  2448. assert(VecIdx == -1 && "Orphaned vector.");
  2449. VecIdx = parti;
  2450. }
        // That's the last element of this load op.
  2452. if (VectorInfo[parti] & PVF_LAST) {
  2453. unsigned NumElts = parti - VecIdx + 1;
  2454. EVT EltVT = VTs[parti];
  2455. // i1 is loaded/stored as i8.
  2456. EVT LoadVT = EltVT;
  2457. if (EltVT == MVT::i1)
  2458. LoadVT = MVT::i8;
  2459. else if (EltVT == MVT::v2f16)
  2460. // getLoad needs a vector type, but it can't handle
  2461. // vectors which contain v2f16 elements. So we must load
  2462. // using i32 here and then bitcast back.
  2463. LoadVT = MVT::i32;
  2464. EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
  2465. SDValue VecAddr =
  2466. DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
  2467. DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
  2468. Value *srcValue = Constant::getNullValue(PointerType::get(
  2469. EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
  2470. SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
  2471. MachinePointerInfo(srcValue),
  2472. MaybeAlign(aggregateIsPacked ? 1 : 0),
  2473. MachineMemOperand::MODereferenceable |
  2474. MachineMemOperand::MOInvariant);
  2475. if (P.getNode())
  2476. P.getNode()->setIROrder(idx + 1);
  2477. for (unsigned j = 0; j < NumElts; ++j) {
  2478. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
  2479. DAG.getIntPtrConstant(j, dl));
  2480. // We've loaded i1 as an i8 and now must truncate it back to i1
  2481. if (EltVT == MVT::i1)
  2482. Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
  2483. // v2f16 was loaded as an i32. Now we must bitcast it back.
  2484. else if (EltVT == MVT::v2f16)
  2485. Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
  2486. // If a promoted integer type is used, truncate down to the original
  2487. MVT PromotedVT;
  2488. if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
  2489. Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  2490. }
  2491. // Extend the element if necessary (e.g. an i8 is loaded
  2492. // into an i16 register)
  2493. if (Ins[InsIdx].VT.isInteger() &&
  2494. Ins[InsIdx].VT.getFixedSizeInBits() >
  2495. LoadVT.getFixedSizeInBits()) {
  2496. unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
  2497. : ISD::ZERO_EXTEND;
  2498. Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
  2499. }
  2500. InVals.push_back(Elt);
  2501. }
  2502. // Reset vector tracking state.
  2503. VecIdx = -1;
  2504. }
  2505. ++InsIdx;
  2506. }
  2507. if (VTs.size() > 0)
  2508. --InsIdx;
  2509. continue;
  2510. }
  2511. // Param has ByVal attribute
  2512. // Return MoveParam(param symbol).
  2513. // Ideally, the param symbol can be returned directly,
  2514. // but when SDNode builder decides to use it in a CopyToReg(),
  2515. // machine instruction fails because TargetExternalSymbol
  2516. // (not lowered) is target dependent, and CopyToReg assumes
  2517. // the source is lowered.
  2518. EVT ObjectVT = getValueType(DL, Ty);
  2519. assert(ObjectVT == Ins[InsIdx].VT &&
  2520. "Ins type did not match function type");
  2521. SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
  2522. SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
  2523. if (p.getNode())
  2524. p.getNode()->setIROrder(idx + 1);
  2525. InVals.push_back(p);
  2526. }
  2527. if (!OutChains.empty())
  2528. DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
  2529. return Chain;
  2530. }
  2531. SDValue
  2532. NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
  2533. bool isVarArg,
  2534. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2535. const SmallVectorImpl<SDValue> &OutVals,
  2536. const SDLoc &dl, SelectionDAG &DAG) const {
  2537. const MachineFunction &MF = DAG.getMachineFunction();
  2538. const Function &F = MF.getFunction();
  2539. Type *RetTy = MF.getFunction().getReturnType();
  2540. bool isABI = (STI.getSmVersion() >= 20);
  2541. assert(isABI && "Non-ABI compilation is not supported");
  2542. if (!isABI)
  2543. return Chain;
  2544. const DataLayout &DL = DAG.getDataLayout();
  2545. SmallVector<SDValue, 16> PromotedOutVals;
  2546. SmallVector<EVT, 16> VTs;
  2547. SmallVector<uint64_t, 16> Offsets;
  2548. ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  2549. assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
  2550. for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
  2551. SDValue PromotedOutVal = OutVals[i];
  2552. MVT PromotedVT;
  2553. if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
  2554. VTs[i] = EVT(PromotedVT);
  2555. }
  2556. if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
  2557. llvm::ISD::NodeType Ext =
  2558. Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  2559. PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
  2560. }
  2561. PromotedOutVals.push_back(PromotedOutVal);
  2562. }
  2563. auto VectorInfo = VectorizePTXValueVTs(
  2564. VTs, Offsets,
  2565. RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
  2566. : Align(1));
  2567. // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  2568. // 32-bits are sign extended or zero extended, depending on whether
  2569. // they are signed or unsigned types.
  2570. bool ExtendIntegerRetVal =
  2571. RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
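  // For example, an i8 or i16 return value is widened to i32 and stored as a
  // 32-bit value below.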
  2572. SmallVector<SDValue, 6> StoreOperands;
  2573. for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
  2574. // New load/store. Record chain and offset operands.
  2575. if (VectorInfo[i] & PVF_FIRST) {
  2576. assert(StoreOperands.empty() && "Orphaned operand list.");
  2577. StoreOperands.push_back(Chain);
  2578. StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
  2579. }
  2580. SDValue OutVal = OutVals[i];
  2581. SDValue RetVal = PromotedOutVals[i];
  2582. if (ExtendIntegerRetVal) {
  2583. RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
  2584. : ISD::ZERO_EXTEND,
  2585. dl, MVT::i32, RetVal);
  2586. } else if (OutVal.getValueSizeInBits() < 16) {
  2587. // Use 16-bit registers for small load-stores as it's the
  2588. // smallest general purpose register size supported by NVPTX.
  2589. RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
  2590. }
  2591. // Record the value to return.
  2592. StoreOperands.push_back(RetVal);
  2593. // That's the last element of this store op.
  2594. if (VectorInfo[i] & PVF_LAST) {
  2595. NVPTXISD::NodeType Op;
  2596. unsigned NumElts = StoreOperands.size() - 2;
  2597. switch (NumElts) {
  2598. case 1:
  2599. Op = NVPTXISD::StoreRetval;
  2600. break;
  2601. case 2:
  2602. Op = NVPTXISD::StoreRetvalV2;
  2603. break;
  2604. case 4:
  2605. Op = NVPTXISD::StoreRetvalV4;
  2606. break;
  2607. default:
  2608. llvm_unreachable("Invalid vector info.");
  2609. }
  2610. // Adjust type of load/store op if we've extended the scalar
  2611. // return value.
  2612. EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
  2613. Chain = DAG.getMemIntrinsicNode(
  2614. Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
  2615. MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
  2616. // Cleanup vector state.
  2617. StoreOperands.clear();
  2618. }
  2619. }
  2620. return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
  2621. }
  2622. void NVPTXTargetLowering::LowerAsmOperandForConstraint(
  2623. SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
  2624. SelectionDAG &DAG) const {
  2625. if (Constraint.length() > 1)
  2626. return;
  2627. else
  2628. TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  2629. }
  2630. static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
  2631. switch (Intrinsic) {
  2632. default:
  2633. return 0;
  2634. case Intrinsic::nvvm_tex_1d_v4f32_s32:
  2635. return NVPTXISD::Tex1DFloatS32;
  2636. case Intrinsic::nvvm_tex_1d_v4f32_f32:
  2637. return NVPTXISD::Tex1DFloatFloat;
  2638. case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  2639. return NVPTXISD::Tex1DFloatFloatLevel;
  2640. case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  2641. return NVPTXISD::Tex1DFloatFloatGrad;
  2642. case Intrinsic::nvvm_tex_1d_v4s32_s32:
  2643. return NVPTXISD::Tex1DS32S32;
  2644. case Intrinsic::nvvm_tex_1d_v4s32_f32:
  2645. return NVPTXISD::Tex1DS32Float;
  2646. case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  2647. return NVPTXISD::Tex1DS32FloatLevel;
  2648. case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  2649. return NVPTXISD::Tex1DS32FloatGrad;
  2650. case Intrinsic::nvvm_tex_1d_v4u32_s32:
  2651. return NVPTXISD::Tex1DU32S32;
  2652. case Intrinsic::nvvm_tex_1d_v4u32_f32:
  2653. return NVPTXISD::Tex1DU32Float;
  2654. case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  2655. return NVPTXISD::Tex1DU32FloatLevel;
  2656. case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  2657. return NVPTXISD::Tex1DU32FloatGrad;
  2658. case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  2659. return NVPTXISD::Tex1DArrayFloatS32;
  2660. case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  2661. return NVPTXISD::Tex1DArrayFloatFloat;
  2662. case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  2663. return NVPTXISD::Tex1DArrayFloatFloatLevel;
  2664. case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  2665. return NVPTXISD::Tex1DArrayFloatFloatGrad;
  2666. case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  2667. return NVPTXISD::Tex1DArrayS32S32;
  2668. case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  2669. return NVPTXISD::Tex1DArrayS32Float;
  2670. case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  2671. return NVPTXISD::Tex1DArrayS32FloatLevel;
  2672. case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  2673. return NVPTXISD::Tex1DArrayS32FloatGrad;
  2674. case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  2675. return NVPTXISD::Tex1DArrayU32S32;
  2676. case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  2677. return NVPTXISD::Tex1DArrayU32Float;
  2678. case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  2679. return NVPTXISD::Tex1DArrayU32FloatLevel;
  2680. case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  2681. return NVPTXISD::Tex1DArrayU32FloatGrad;
  2682. case Intrinsic::nvvm_tex_2d_v4f32_s32:
  2683. return NVPTXISD::Tex2DFloatS32;
  2684. case Intrinsic::nvvm_tex_2d_v4f32_f32:
  2685. return NVPTXISD::Tex2DFloatFloat;
  2686. case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  2687. return NVPTXISD::Tex2DFloatFloatLevel;
  2688. case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  2689. return NVPTXISD::Tex2DFloatFloatGrad;
  2690. case Intrinsic::nvvm_tex_2d_v4s32_s32:
  2691. return NVPTXISD::Tex2DS32S32;
  2692. case Intrinsic::nvvm_tex_2d_v4s32_f32:
  2693. return NVPTXISD::Tex2DS32Float;
  2694. case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  2695. return NVPTXISD::Tex2DS32FloatLevel;
  2696. case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  2697. return NVPTXISD::Tex2DS32FloatGrad;
  2698. case Intrinsic::nvvm_tex_2d_v4u32_s32:
  2699. return NVPTXISD::Tex2DU32S32;
  2700. case Intrinsic::nvvm_tex_2d_v4u32_f32:
  2701. return NVPTXISD::Tex2DU32Float;
  2702. case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  2703. return NVPTXISD::Tex2DU32FloatLevel;
  2704. case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  2705. return NVPTXISD::Tex2DU32FloatGrad;
  2706. case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  2707. return NVPTXISD::Tex2DArrayFloatS32;
  2708. case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  2709. return NVPTXISD::Tex2DArrayFloatFloat;
  2710. case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  2711. return NVPTXISD::Tex2DArrayFloatFloatLevel;
  2712. case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  2713. return NVPTXISD::Tex2DArrayFloatFloatGrad;
  2714. case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  2715. return NVPTXISD::Tex2DArrayS32S32;
  2716. case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  2717. return NVPTXISD::Tex2DArrayS32Float;
  2718. case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  2719. return NVPTXISD::Tex2DArrayS32FloatLevel;
  2720. case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  2721. return NVPTXISD::Tex2DArrayS32FloatGrad;
  2722. case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  2723. return NVPTXISD::Tex2DArrayU32S32;
  2724. case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  2725. return NVPTXISD::Tex2DArrayU32Float;
  2726. case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  2727. return NVPTXISD::Tex2DArrayU32FloatLevel;
  2728. case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  2729. return NVPTXISD::Tex2DArrayU32FloatGrad;
  2730. case Intrinsic::nvvm_tex_3d_v4f32_s32:
  2731. return NVPTXISD::Tex3DFloatS32;
  2732. case Intrinsic::nvvm_tex_3d_v4f32_f32:
  2733. return NVPTXISD::Tex3DFloatFloat;
  2734. case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  2735. return NVPTXISD::Tex3DFloatFloatLevel;
  2736. case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  2737. return NVPTXISD::Tex3DFloatFloatGrad;
  2738. case Intrinsic::nvvm_tex_3d_v4s32_s32:
  2739. return NVPTXISD::Tex3DS32S32;
  2740. case Intrinsic::nvvm_tex_3d_v4s32_f32:
  2741. return NVPTXISD::Tex3DS32Float;
  2742. case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  2743. return NVPTXISD::Tex3DS32FloatLevel;
  2744. case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  2745. return NVPTXISD::Tex3DS32FloatGrad;
  2746. case Intrinsic::nvvm_tex_3d_v4u32_s32:
  2747. return NVPTXISD::Tex3DU32S32;
  2748. case Intrinsic::nvvm_tex_3d_v4u32_f32:
  2749. return NVPTXISD::Tex3DU32Float;
  2750. case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  2751. return NVPTXISD::Tex3DU32FloatLevel;
  2752. case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  2753. return NVPTXISD::Tex3DU32FloatGrad;
  2754. case Intrinsic::nvvm_tex_cube_v4f32_f32:
  2755. return NVPTXISD::TexCubeFloatFloat;
  2756. case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  2757. return NVPTXISD::TexCubeFloatFloatLevel;
  2758. case Intrinsic::nvvm_tex_cube_v4s32_f32:
  2759. return NVPTXISD::TexCubeS32Float;
  2760. case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  2761. return NVPTXISD::TexCubeS32FloatLevel;
  2762. case Intrinsic::nvvm_tex_cube_v4u32_f32:
  2763. return NVPTXISD::TexCubeU32Float;
  2764. case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  2765. return NVPTXISD::TexCubeU32FloatLevel;
  2766. case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  2767. return NVPTXISD::TexCubeArrayFloatFloat;
  2768. case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  2769. return NVPTXISD::TexCubeArrayFloatFloatLevel;
  2770. case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  2771. return NVPTXISD::TexCubeArrayS32Float;
  2772. case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  2773. return NVPTXISD::TexCubeArrayS32FloatLevel;
  2774. case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  2775. return NVPTXISD::TexCubeArrayU32Float;
  2776. case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  2777. return NVPTXISD::TexCubeArrayU32FloatLevel;
  2778. case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  2779. return NVPTXISD::Tld4R2DFloatFloat;
  2780. case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  2781. return NVPTXISD::Tld4G2DFloatFloat;
  2782. case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  2783. return NVPTXISD::Tld4B2DFloatFloat;
  2784. case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  2785. return NVPTXISD::Tld4A2DFloatFloat;
  2786. case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  2787. return NVPTXISD::Tld4R2DS64Float;
  2788. case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  2789. return NVPTXISD::Tld4G2DS64Float;
  2790. case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  2791. return NVPTXISD::Tld4B2DS64Float;
  2792. case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  2793. return NVPTXISD::Tld4A2DS64Float;
  2794. case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  2795. return NVPTXISD::Tld4R2DU64Float;
  2796. case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  2797. return NVPTXISD::Tld4G2DU64Float;
  2798. case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  2799. return NVPTXISD::Tld4B2DU64Float;
  2800. case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  2801. return NVPTXISD::Tld4A2DU64Float;
  2802. case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  2803. return NVPTXISD::TexUnified1DFloatS32;
  2804. case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  2805. return NVPTXISD::TexUnified1DFloatFloat;
  2806. case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  2807. return NVPTXISD::TexUnified1DFloatFloatLevel;
  2808. case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  2809. return NVPTXISD::TexUnified1DFloatFloatGrad;
  2810. case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  2811. return NVPTXISD::TexUnified1DS32S32;
  2812. case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  2813. return NVPTXISD::TexUnified1DS32Float;
  2814. case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  2815. return NVPTXISD::TexUnified1DS32FloatLevel;
  2816. case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  2817. return NVPTXISD::TexUnified1DS32FloatGrad;
  2818. case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  2819. return NVPTXISD::TexUnified1DU32S32;
  2820. case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  2821. return NVPTXISD::TexUnified1DU32Float;
  2822. case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  2823. return NVPTXISD::TexUnified1DU32FloatLevel;
  2824. case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  2825. return NVPTXISD::TexUnified1DU32FloatGrad;
  2826. case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  2827. return NVPTXISD::TexUnified1DArrayFloatS32;
  2828. case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  2829. return NVPTXISD::TexUnified1DArrayFloatFloat;
  2830. case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  2831. return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
  2832. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  2833. return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
  2834. case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  2835. return NVPTXISD::TexUnified1DArrayS32S32;
  2836. case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  2837. return NVPTXISD::TexUnified1DArrayS32Float;
  2838. case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  2839. return NVPTXISD::TexUnified1DArrayS32FloatLevel;
  2840. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  2841. return NVPTXISD::TexUnified1DArrayS32FloatGrad;
  2842. case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  2843. return NVPTXISD::TexUnified1DArrayU32S32;
  2844. case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  2845. return NVPTXISD::TexUnified1DArrayU32Float;
  2846. case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  2847. return NVPTXISD::TexUnified1DArrayU32FloatLevel;
  2848. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  2849. return NVPTXISD::TexUnified1DArrayU32FloatGrad;
  2850. case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  2851. return NVPTXISD::TexUnified2DFloatS32;
  2852. case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  2853. return NVPTXISD::TexUnified2DFloatFloat;
  2854. case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  2855. return NVPTXISD::TexUnified2DFloatFloatLevel;
  2856. case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  2857. return NVPTXISD::TexUnified2DFloatFloatGrad;
  2858. case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  2859. return NVPTXISD::TexUnified2DS32S32;
  2860. case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  2861. return NVPTXISD::TexUnified2DS32Float;
  2862. case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  2863. return NVPTXISD::TexUnified2DS32FloatLevel;
  2864. case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  2865. return NVPTXISD::TexUnified2DS32FloatGrad;
  2866. case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  2867. return NVPTXISD::TexUnified2DU32S32;
  2868. case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  2869. return NVPTXISD::TexUnified2DU32Float;
  2870. case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  2871. return NVPTXISD::TexUnified2DU32FloatLevel;
  2872. case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  2873. return NVPTXISD::TexUnified2DU32FloatGrad;
  2874. case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  2875. return NVPTXISD::TexUnified2DArrayFloatS32;
  2876. case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  2877. return NVPTXISD::TexUnified2DArrayFloatFloat;
  2878. case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  2879. return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
  2880. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  2881. return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
  2882. case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  2883. return NVPTXISD::TexUnified2DArrayS32S32;
  2884. case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  2885. return NVPTXISD::TexUnified2DArrayS32Float;
  2886. case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  2887. return NVPTXISD::TexUnified2DArrayS32FloatLevel;
  2888. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  2889. return NVPTXISD::TexUnified2DArrayS32FloatGrad;
  2890. case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  2891. return NVPTXISD::TexUnified2DArrayU32S32;
  2892. case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  2893. return NVPTXISD::TexUnified2DArrayU32Float;
  2894. case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  2895. return NVPTXISD::TexUnified2DArrayU32FloatLevel;
  2896. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  2897. return NVPTXISD::TexUnified2DArrayU32FloatGrad;
  2898. case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  2899. return NVPTXISD::TexUnified3DFloatS32;
  2900. case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  2901. return NVPTXISD::TexUnified3DFloatFloat;
  2902. case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  2903. return NVPTXISD::TexUnified3DFloatFloatLevel;
  2904. case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  2905. return NVPTXISD::TexUnified3DFloatFloatGrad;
  2906. case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  2907. return NVPTXISD::TexUnified3DS32S32;
  2908. case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  2909. return NVPTXISD::TexUnified3DS32Float;
  2910. case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  2911. return NVPTXISD::TexUnified3DS32FloatLevel;
  2912. case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  2913. return NVPTXISD::TexUnified3DS32FloatGrad;
  2914. case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  2915. return NVPTXISD::TexUnified3DU32S32;
  2916. case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  2917. return NVPTXISD::TexUnified3DU32Float;
  2918. case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  2919. return NVPTXISD::TexUnified3DU32FloatLevel;
  2920. case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  2921. return NVPTXISD::TexUnified3DU32FloatGrad;
  2922. case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  2923. return NVPTXISD::TexUnifiedCubeFloatFloat;
  2924. case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  2925. return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
  2926. case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  2927. return NVPTXISD::TexUnifiedCubeS32Float;
  2928. case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  2929. return NVPTXISD::TexUnifiedCubeS32FloatLevel;
  2930. case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  2931. return NVPTXISD::TexUnifiedCubeU32Float;
  2932. case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  2933. return NVPTXISD::TexUnifiedCubeU32FloatLevel;
  2934. case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  2935. return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
  2936. case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  2937. return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
  2938. case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  2939. return NVPTXISD::TexUnifiedCubeArrayS32Float;
  2940. case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  2941. return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
  2942. case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  2943. return NVPTXISD::TexUnifiedCubeArrayU32Float;
  2944. case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  2945. return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
  2946. case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  2947. return NVPTXISD::Tld4UnifiedR2DFloatFloat;
  2948. case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  2949. return NVPTXISD::Tld4UnifiedG2DFloatFloat;
  2950. case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  2951. return NVPTXISD::Tld4UnifiedB2DFloatFloat;
  2952. case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
  2953. return NVPTXISD::Tld4UnifiedA2DFloatFloat;
  2954. case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  2955. return NVPTXISD::Tld4UnifiedR2DS64Float;
  2956. case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  2957. return NVPTXISD::Tld4UnifiedG2DS64Float;
  2958. case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  2959. return NVPTXISD::Tld4UnifiedB2DS64Float;
  2960. case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  2961. return NVPTXISD::Tld4UnifiedA2DS64Float;
  2962. case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  2963. return NVPTXISD::Tld4UnifiedR2DU64Float;
  2964. case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  2965. return NVPTXISD::Tld4UnifiedG2DU64Float;
  2966. case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  2967. return NVPTXISD::Tld4UnifiedB2DU64Float;
  2968. case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
  2969. return NVPTXISD::Tld4UnifiedA2DU64Float;
  2970. }
  2971. }
  2972. static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
  2973. switch (Intrinsic) {
  2974. default:
  2975. return 0;
  2976. case Intrinsic::nvvm_suld_1d_i8_clamp:
  2977. return NVPTXISD::Suld1DI8Clamp;
  2978. case Intrinsic::nvvm_suld_1d_i16_clamp:
  2979. return NVPTXISD::Suld1DI16Clamp;
  2980. case Intrinsic::nvvm_suld_1d_i32_clamp:
  2981. return NVPTXISD::Suld1DI32Clamp;
  2982. case Intrinsic::nvvm_suld_1d_i64_clamp:
  2983. return NVPTXISD::Suld1DI64Clamp;
  2984. case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  2985. return NVPTXISD::Suld1DV2I8Clamp;
  2986. case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  2987. return NVPTXISD::Suld1DV2I16Clamp;
  2988. case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  2989. return NVPTXISD::Suld1DV2I32Clamp;
  2990. case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  2991. return NVPTXISD::Suld1DV2I64Clamp;
  2992. case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  2993. return NVPTXISD::Suld1DV4I8Clamp;
  2994. case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  2995. return NVPTXISD::Suld1DV4I16Clamp;
  2996. case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  2997. return NVPTXISD::Suld1DV4I32Clamp;
  2998. case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  2999. return NVPTXISD::Suld1DArrayI8Clamp;
  3000. case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  3001. return NVPTXISD::Suld1DArrayI16Clamp;
  3002. case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  3003. return NVPTXISD::Suld1DArrayI32Clamp;
  3004. case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  3005. return NVPTXISD::Suld1DArrayI64Clamp;
  3006. case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  3007. return NVPTXISD::Suld1DArrayV2I8Clamp;
  3008. case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  3009. return NVPTXISD::Suld1DArrayV2I16Clamp;
  3010. case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  3011. return NVPTXISD::Suld1DArrayV2I32Clamp;
  3012. case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  3013. return NVPTXISD::Suld1DArrayV2I64Clamp;
  3014. case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  3015. return NVPTXISD::Suld1DArrayV4I8Clamp;
  3016. case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  3017. return NVPTXISD::Suld1DArrayV4I16Clamp;
  3018. case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  3019. return NVPTXISD::Suld1DArrayV4I32Clamp;
  3020. case Intrinsic::nvvm_suld_2d_i8_clamp:
  3021. return NVPTXISD::Suld2DI8Clamp;
  3022. case Intrinsic::nvvm_suld_2d_i16_clamp:
  3023. return NVPTXISD::Suld2DI16Clamp;
  3024. case Intrinsic::nvvm_suld_2d_i32_clamp:
  3025. return NVPTXISD::Suld2DI32Clamp;
  3026. case Intrinsic::nvvm_suld_2d_i64_clamp:
  3027. return NVPTXISD::Suld2DI64Clamp;
  3028. case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  3029. return NVPTXISD::Suld2DV2I8Clamp;
  3030. case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  3031. return NVPTXISD::Suld2DV2I16Clamp;
  3032. case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  3033. return NVPTXISD::Suld2DV2I32Clamp;
  3034. case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  3035. return NVPTXISD::Suld2DV2I64Clamp;
  3036. case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  3037. return NVPTXISD::Suld2DV4I8Clamp;
  3038. case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  3039. return NVPTXISD::Suld2DV4I16Clamp;
  3040. case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  3041. return NVPTXISD::Suld2DV4I32Clamp;
  3042. case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  3043. return NVPTXISD::Suld2DArrayI8Clamp;
  3044. case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  3045. return NVPTXISD::Suld2DArrayI16Clamp;
  3046. case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  3047. return NVPTXISD::Suld2DArrayI32Clamp;
  3048. case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  3049. return NVPTXISD::Suld2DArrayI64Clamp;
  3050. case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  3051. return NVPTXISD::Suld2DArrayV2I8Clamp;
  3052. case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  3053. return NVPTXISD::Suld2DArrayV2I16Clamp;
  3054. case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  3055. return NVPTXISD::Suld2DArrayV2I32Clamp;
  3056. case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  3057. return NVPTXISD::Suld2DArrayV2I64Clamp;
  3058. case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  3059. return NVPTXISD::Suld2DArrayV4I8Clamp;
  3060. case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  3061. return NVPTXISD::Suld2DArrayV4I16Clamp;
  3062. case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  3063. return NVPTXISD::Suld2DArrayV4I32Clamp;
  3064. case Intrinsic::nvvm_suld_3d_i8_clamp:
  3065. return NVPTXISD::Suld3DI8Clamp;
  3066. case Intrinsic::nvvm_suld_3d_i16_clamp:
  3067. return NVPTXISD::Suld3DI16Clamp;
  3068. case Intrinsic::nvvm_suld_3d_i32_clamp:
  3069. return NVPTXISD::Suld3DI32Clamp;
  3070. case Intrinsic::nvvm_suld_3d_i64_clamp:
  3071. return NVPTXISD::Suld3DI64Clamp;
  3072. case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  3073. return NVPTXISD::Suld3DV2I8Clamp;
  3074. case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  3075. return NVPTXISD::Suld3DV2I16Clamp;
  3076. case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  3077. return NVPTXISD::Suld3DV2I32Clamp;
  3078. case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  3079. return NVPTXISD::Suld3DV2I64Clamp;
  3080. case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  3081. return NVPTXISD::Suld3DV4I8Clamp;
  3082. case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  3083. return NVPTXISD::Suld3DV4I16Clamp;
  3084. case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  3085. return NVPTXISD::Suld3DV4I32Clamp;
  3086. case Intrinsic::nvvm_suld_1d_i8_trap:
  3087. return NVPTXISD::Suld1DI8Trap;
  3088. case Intrinsic::nvvm_suld_1d_i16_trap:
  3089. return NVPTXISD::Suld1DI16Trap;
  3090. case Intrinsic::nvvm_suld_1d_i32_trap:
  3091. return NVPTXISD::Suld1DI32Trap;
  3092. case Intrinsic::nvvm_suld_1d_i64_trap:
  3093. return NVPTXISD::Suld1DI64Trap;
  3094. case Intrinsic::nvvm_suld_1d_v2i8_trap:
  3095. return NVPTXISD::Suld1DV2I8Trap;
  3096. case Intrinsic::nvvm_suld_1d_v2i16_trap:
  3097. return NVPTXISD::Suld1DV2I16Trap;
  3098. case Intrinsic::nvvm_suld_1d_v2i32_trap:
  3099. return NVPTXISD::Suld1DV2I32Trap;
  3100. case Intrinsic::nvvm_suld_1d_v2i64_trap:
  3101. return NVPTXISD::Suld1DV2I64Trap;
  3102. case Intrinsic::nvvm_suld_1d_v4i8_trap:
  3103. return NVPTXISD::Suld1DV4I8Trap;
  3104. case Intrinsic::nvvm_suld_1d_v4i16_trap:
  3105. return NVPTXISD::Suld1DV4I16Trap;
  3106. case Intrinsic::nvvm_suld_1d_v4i32_trap:
  3107. return NVPTXISD::Suld1DV4I32Trap;
  3108. case Intrinsic::nvvm_suld_1d_array_i8_trap:
  3109. return NVPTXISD::Suld1DArrayI8Trap;
  3110. case Intrinsic::nvvm_suld_1d_array_i16_trap:
  3111. return NVPTXISD::Suld1DArrayI16Trap;
  3112. case Intrinsic::nvvm_suld_1d_array_i32_trap:
  3113. return NVPTXISD::Suld1DArrayI32Trap;
  3114. case Intrinsic::nvvm_suld_1d_array_i64_trap:
  3115. return NVPTXISD::Suld1DArrayI64Trap;
  3116. case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  3117. return NVPTXISD::Suld1DArrayV2I8Trap;
  3118. case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  3119. return NVPTXISD::Suld1DArrayV2I16Trap;
  3120. case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  3121. return NVPTXISD::Suld1DArrayV2I32Trap;
  3122. case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  3123. return NVPTXISD::Suld1DArrayV2I64Trap;
  3124. case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  3125. return NVPTXISD::Suld1DArrayV4I8Trap;
  3126. case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  3127. return NVPTXISD::Suld1DArrayV4I16Trap;
  3128. case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  3129. return NVPTXISD::Suld1DArrayV4I32Trap;
  3130. case Intrinsic::nvvm_suld_2d_i8_trap:
  3131. return NVPTXISD::Suld2DI8Trap;
  3132. case Intrinsic::nvvm_suld_2d_i16_trap:
  3133. return NVPTXISD::Suld2DI16Trap;
  3134. case Intrinsic::nvvm_suld_2d_i32_trap:
  3135. return NVPTXISD::Suld2DI32Trap;
  3136. case Intrinsic::nvvm_suld_2d_i64_trap:
  3137. return NVPTXISD::Suld2DI64Trap;
  3138. case Intrinsic::nvvm_suld_2d_v2i8_trap:
  3139. return NVPTXISD::Suld2DV2I8Trap;
  3140. case Intrinsic::nvvm_suld_2d_v2i16_trap:
  3141. return NVPTXISD::Suld2DV2I16Trap;
  3142. case Intrinsic::nvvm_suld_2d_v2i32_trap:
  3143. return NVPTXISD::Suld2DV2I32Trap;
  3144. case Intrinsic::nvvm_suld_2d_v2i64_trap:
  3145. return NVPTXISD::Suld2DV2I64Trap;
  3146. case Intrinsic::nvvm_suld_2d_v4i8_trap:
  3147. return NVPTXISD::Suld2DV4I8Trap;
  3148. case Intrinsic::nvvm_suld_2d_v4i16_trap:
  3149. return NVPTXISD::Suld2DV4I16Trap;
  3150. case Intrinsic::nvvm_suld_2d_v4i32_trap:
  3151. return NVPTXISD::Suld2DV4I32Trap;
  3152. case Intrinsic::nvvm_suld_2d_array_i8_trap:
  3153. return NVPTXISD::Suld2DArrayI8Trap;
  3154. case Intrinsic::nvvm_suld_2d_array_i16_trap:
  3155. return NVPTXISD::Suld2DArrayI16Trap;
  3156. case Intrinsic::nvvm_suld_2d_array_i32_trap:
  3157. return NVPTXISD::Suld2DArrayI32Trap;
  3158. case Intrinsic::nvvm_suld_2d_array_i64_trap:
  3159. return NVPTXISD::Suld2DArrayI64Trap;
  3160. case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  3161. return NVPTXISD::Suld2DArrayV2I8Trap;
  3162. case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  3163. return NVPTXISD::Suld2DArrayV2I16Trap;
  3164. case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  3165. return NVPTXISD::Suld2DArrayV2I32Trap;
  3166. case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  3167. return NVPTXISD::Suld2DArrayV2I64Trap;
  3168. case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  3169. return NVPTXISD::Suld2DArrayV4I8Trap;
  3170. case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  3171. return NVPTXISD::Suld2DArrayV4I16Trap;
  3172. case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  3173. return NVPTXISD::Suld2DArrayV4I32Trap;
  3174. case Intrinsic::nvvm_suld_3d_i8_trap:
  3175. return NVPTXISD::Suld3DI8Trap;
  3176. case Intrinsic::nvvm_suld_3d_i16_trap:
  3177. return NVPTXISD::Suld3DI16Trap;
  3178. case Intrinsic::nvvm_suld_3d_i32_trap:
  3179. return NVPTXISD::Suld3DI32Trap;
  3180. case Intrinsic::nvvm_suld_3d_i64_trap:
  3181. return NVPTXISD::Suld3DI64Trap;
  3182. case Intrinsic::nvvm_suld_3d_v2i8_trap:
  3183. return NVPTXISD::Suld3DV2I8Trap;
  3184. case Intrinsic::nvvm_suld_3d_v2i16_trap:
  3185. return NVPTXISD::Suld3DV2I16Trap;
  3186. case Intrinsic::nvvm_suld_3d_v2i32_trap:
  3187. return NVPTXISD::Suld3DV2I32Trap;
  3188. case Intrinsic::nvvm_suld_3d_v2i64_trap:
  3189. return NVPTXISD::Suld3DV2I64Trap;
  3190. case Intrinsic::nvvm_suld_3d_v4i8_trap:
  3191. return NVPTXISD::Suld3DV4I8Trap;
  3192. case Intrinsic::nvvm_suld_3d_v4i16_trap:
  3193. return NVPTXISD::Suld3DV4I16Trap;
  3194. case Intrinsic::nvvm_suld_3d_v4i32_trap:
  3195. return NVPTXISD::Suld3DV4I32Trap;
  3196. case Intrinsic::nvvm_suld_1d_i8_zero:
  3197. return NVPTXISD::Suld1DI8Zero;
  3198. case Intrinsic::nvvm_suld_1d_i16_zero:
  3199. return NVPTXISD::Suld1DI16Zero;
  3200. case Intrinsic::nvvm_suld_1d_i32_zero:
  3201. return NVPTXISD::Suld1DI32Zero;
  3202. case Intrinsic::nvvm_suld_1d_i64_zero:
  3203. return NVPTXISD::Suld1DI64Zero;
  3204. case Intrinsic::nvvm_suld_1d_v2i8_zero:
  3205. return NVPTXISD::Suld1DV2I8Zero;
  3206. case Intrinsic::nvvm_suld_1d_v2i16_zero:
  3207. return NVPTXISD::Suld1DV2I16Zero;
  3208. case Intrinsic::nvvm_suld_1d_v2i32_zero:
  3209. return NVPTXISD::Suld1DV2I32Zero;
  3210. case Intrinsic::nvvm_suld_1d_v2i64_zero:
  3211. return NVPTXISD::Suld1DV2I64Zero;
  3212. case Intrinsic::nvvm_suld_1d_v4i8_zero:
  3213. return NVPTXISD::Suld1DV4I8Zero;
  3214. case Intrinsic::nvvm_suld_1d_v4i16_zero:
  3215. return NVPTXISD::Suld1DV4I16Zero;
  3216. case Intrinsic::nvvm_suld_1d_v4i32_zero:
  3217. return NVPTXISD::Suld1DV4I32Zero;
  3218. case Intrinsic::nvvm_suld_1d_array_i8_zero:
  3219. return NVPTXISD::Suld1DArrayI8Zero;
  3220. case Intrinsic::nvvm_suld_1d_array_i16_zero:
  3221. return NVPTXISD::Suld1DArrayI16Zero;
  3222. case Intrinsic::nvvm_suld_1d_array_i32_zero:
  3223. return NVPTXISD::Suld1DArrayI32Zero;
  3224. case Intrinsic::nvvm_suld_1d_array_i64_zero:
  3225. return NVPTXISD::Suld1DArrayI64Zero;
  3226. case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  3227. return NVPTXISD::Suld1DArrayV2I8Zero;
  3228. case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  3229. return NVPTXISD::Suld1DArrayV2I16Zero;
  3230. case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  3231. return NVPTXISD::Suld1DArrayV2I32Zero;
  3232. case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  3233. return NVPTXISD::Suld1DArrayV2I64Zero;
  3234. case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  3235. return NVPTXISD::Suld1DArrayV4I8Zero;
  3236. case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  3237. return NVPTXISD::Suld1DArrayV4I16Zero;
  3238. case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  3239. return NVPTXISD::Suld1DArrayV4I32Zero;
  3240. case Intrinsic::nvvm_suld_2d_i8_zero:
  3241. return NVPTXISD::Suld2DI8Zero;
  3242. case Intrinsic::nvvm_suld_2d_i16_zero:
  3243. return NVPTXISD::Suld2DI16Zero;
  3244. case Intrinsic::nvvm_suld_2d_i32_zero:
  3245. return NVPTXISD::Suld2DI32Zero;
  3246. case Intrinsic::nvvm_suld_2d_i64_zero:
  3247. return NVPTXISD::Suld2DI64Zero;
  3248. case Intrinsic::nvvm_suld_2d_v2i8_zero:
  3249. return NVPTXISD::Suld2DV2I8Zero;
  3250. case Intrinsic::nvvm_suld_2d_v2i16_zero:
  3251. return NVPTXISD::Suld2DV2I16Zero;
  3252. case Intrinsic::nvvm_suld_2d_v2i32_zero:
  3253. return NVPTXISD::Suld2DV2I32Zero;
  3254. case Intrinsic::nvvm_suld_2d_v2i64_zero:
  3255. return NVPTXISD::Suld2DV2I64Zero;
  3256. case Intrinsic::nvvm_suld_2d_v4i8_zero:
  3257. return NVPTXISD::Suld2DV4I8Zero;
  3258. case Intrinsic::nvvm_suld_2d_v4i16_zero:
  3259. return NVPTXISD::Suld2DV4I16Zero;
  3260. case Intrinsic::nvvm_suld_2d_v4i32_zero:
  3261. return NVPTXISD::Suld2DV4I32Zero;
  3262. case Intrinsic::nvvm_suld_2d_array_i8_zero:
  3263. return NVPTXISD::Suld2DArrayI8Zero;
  3264. case Intrinsic::nvvm_suld_2d_array_i16_zero:
  3265. return NVPTXISD::Suld2DArrayI16Zero;
  3266. case Intrinsic::nvvm_suld_2d_array_i32_zero:
  3267. return NVPTXISD::Suld2DArrayI32Zero;
  3268. case Intrinsic::nvvm_suld_2d_array_i64_zero:
  3269. return NVPTXISD::Suld2DArrayI64Zero;
  3270. case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  3271. return NVPTXISD::Suld2DArrayV2I8Zero;
  3272. case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  3273. return NVPTXISD::Suld2DArrayV2I16Zero;
  3274. case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  3275. return NVPTXISD::Suld2DArrayV2I32Zero;
  3276. case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  3277. return NVPTXISD::Suld2DArrayV2I64Zero;
  3278. case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  3279. return NVPTXISD::Suld2DArrayV4I8Zero;
  3280. case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  3281. return NVPTXISD::Suld2DArrayV4I16Zero;
  3282. case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  3283. return NVPTXISD::Suld2DArrayV4I32Zero;
  3284. case Intrinsic::nvvm_suld_3d_i8_zero:
  3285. return NVPTXISD::Suld3DI8Zero;
  3286. case Intrinsic::nvvm_suld_3d_i16_zero:
  3287. return NVPTXISD::Suld3DI16Zero;
  3288. case Intrinsic::nvvm_suld_3d_i32_zero:
  3289. return NVPTXISD::Suld3DI32Zero;
  3290. case Intrinsic::nvvm_suld_3d_i64_zero:
  3291. return NVPTXISD::Suld3DI64Zero;
  3292. case Intrinsic::nvvm_suld_3d_v2i8_zero:
  3293. return NVPTXISD::Suld3DV2I8Zero;
  3294. case Intrinsic::nvvm_suld_3d_v2i16_zero:
  3295. return NVPTXISD::Suld3DV2I16Zero;
  3296. case Intrinsic::nvvm_suld_3d_v2i32_zero:
  3297. return NVPTXISD::Suld3DV2I32Zero;
  3298. case Intrinsic::nvvm_suld_3d_v2i64_zero:
  3299. return NVPTXISD::Suld3DV2I64Zero;
  3300. case Intrinsic::nvvm_suld_3d_v4i8_zero:
  3301. return NVPTXISD::Suld3DV4I8Zero;
  3302. case Intrinsic::nvvm_suld_3d_v4i16_zero:
  3303. return NVPTXISD::Suld3DV4I16Zero;
  3304. case Intrinsic::nvvm_suld_3d_v4i32_zero:
  3305. return NVPTXISD::Suld3DV4I32Zero;
  3306. }
  3307. }
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available
// in the "Value" type of the destination pointer. In particular, the
// address space information.
  3313. bool NVPTXTargetLowering::getTgtMemIntrinsic(
  3314. IntrinsicInfo &Info, const CallInst &I,
  3315. MachineFunction &MF, unsigned Intrinsic) const {
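// For intrinsics that access memory, fill in Info so that SelectionDAG can
// attach a MachineMemOperand describing the access; return false for
// everything else.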
  3316. switch (Intrinsic) {
  3317. default:
  3318. return false;
  3319. case Intrinsic::nvvm_match_all_sync_i32p:
  3320. case Intrinsic::nvvm_match_all_sync_i64p:
  3321. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3322. // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
  3323. // in order to model data exchange with other threads, but perform no real
  3324. // memory accesses.
  3325. Info.memVT = MVT::i1;
// Our result depends on both our own arguments and those of the other threads.
  3327. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
  3328. return true;
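// WMMA and ldmatrix loads. memVT and align record what the intrinsic reads
// from memory so that a MachineMemOperand can be attached to the resulting
// node.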
  3329. case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  3330. case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  3331. case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  3332. case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  3333. case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  3334. case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  3335. case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  3336. case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  3337. case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  3338. case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  3339. case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  3340. case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  3341. case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  3342. case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  3343. case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  3344. case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  3345. case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  3346. case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  3347. case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  3348. case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  3349. case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  3350. case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  3351. case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  3352. case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
  3353. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3354. Info.memVT = MVT::v8f16;
  3355. Info.ptrVal = I.getArgOperand(0);
  3356. Info.offset = 0;
  3357. Info.flags = MachineMemOperand::MOLoad;
  3358. Info.align = Align(16);
  3359. return true;
  3360. }
  3361. case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  3362. case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  3363. case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
  3364. case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
  3365. case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
  3366. case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
  3367. case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
  3368. case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
  3369. case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
  3370. case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
  3371. case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
  3372. case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
  3373. case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
  3374. case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
  3375. case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
  3376. case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
  3377. case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
  3378. case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
  3379. case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
  3380. case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
  3381. case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
  3382. case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
  3383. case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
  3384. case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
  3385. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3386. Info.memVT = MVT::v2i32;
  3387. Info.ptrVal = I.getArgOperand(0);
  3388. Info.offset = 0;
  3389. Info.flags = MachineMemOperand::MOLoad;
  3390. Info.align = Align(8);
  3391. return true;
  3392. }
  3393. case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
  3394. case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
  3395. case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
  3396. case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
  3397. case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
  3398. case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
  3399. case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
  3400. case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
  3401. case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
  3402. case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
  3403. case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
  3404. case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
  3405. case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
  3406. case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
  3407. case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
  3408. case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
  3409. case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
  3410. case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
  3411. case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
  3412. case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
  3413. case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
  3414. case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
  3415. case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
  3416. case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
  3417. case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
  3418. case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
  3419. case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
  3420. case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
  3421. case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
  3422. case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
  3423. case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
  3424. case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
  3425. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
  3426. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
  3427. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3428. Info.memVT = MVT::v4i32;
  3429. Info.ptrVal = I.getArgOperand(0);
  3430. Info.offset = 0;
  3431. Info.flags = MachineMemOperand::MOLoad;
  3432. Info.align = Align(16);
  3433. return true;
  3434. }
  3435. case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
  3436. case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
  3437. case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
  3438. case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
  3439. case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
  3440. case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
  3441. case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
  3442. case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
  3443. case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
  3444. case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
  3445. case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
  3446. case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
  3447. case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
  3448. case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
  3449. case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
  3450. case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
  3451. case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
  3452. case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
  3453. case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
  3454. case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
  3455. case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
  3456. case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
  3457. case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
  3458. case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
  3459. case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
  3460. case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
  3461. case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
  3462. case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
  3463. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
  3464. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
  3465. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3466. Info.memVT = MVT::i32;
  3467. Info.ptrVal = I.getArgOperand(0);
  3468. Info.offset = 0;
  3469. Info.flags = MachineMemOperand::MOLoad;
  3470. Info.align = Align(4);
  3471. return true;
  3472. }
  3473. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
  3474. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
  3475. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
  3476. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
  3477. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
  3478. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
  3479. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
  3480. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
  3481. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
  3482. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
  3483. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
  3484. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
  3485. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3486. Info.memVT = MVT::v4f16;
  3487. Info.ptrVal = I.getArgOperand(0);
  3488. Info.offset = 0;
  3489. Info.flags = MachineMemOperand::MOLoad;
  3490. Info.align = Align(16);
  3491. return true;
  3492. }
  3493. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
  3494. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
  3495. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
  3496. case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
  3497. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
  3498. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
  3499. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
  3500. case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
  3501. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
  3502. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
  3503. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
  3504. case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
  3505. case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
  3506. case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
  3507. case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
  3508. case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
  3509. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3510. Info.memVT = MVT::v8f32;
  3511. Info.ptrVal = I.getArgOperand(0);
  3512. Info.offset = 0;
  3513. Info.flags = MachineMemOperand::MOLoad;
  3514. Info.align = Align(16);
  3515. return true;
  3516. }
  3517. case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
  3518. case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
  3519. case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
  3520. case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
  3521. case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
  3522. case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
  3523. case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
  3524. case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
  3525. case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
  3526. case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
  3527. case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
  3528. case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
  3529. case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
  3530. case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
  3531. case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
  3532. case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
  3533. case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
  3534. case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
  3535. case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
  3536. case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
  3537. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3538. Info.memVT = MVT::v8i32;
  3539. Info.ptrVal = I.getArgOperand(0);
  3540. Info.offset = 0;
  3541. Info.flags = MachineMemOperand::MOLoad;
  3542. Info.align = Align(16);
  3543. return true;
  3544. }
  3545. case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
  3546. case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
  3547. case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
  3548. case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
  3549. case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
  3550. case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
  3551. case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
  3552. case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
  3553. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
  3554. case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
  3555. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3556. Info.memVT = MVT::v2i32;
  3557. Info.ptrVal = I.getArgOperand(0);
  3558. Info.offset = 0;
  3559. Info.flags = MachineMemOperand::MOLoad;
  3560. Info.align = Align(8);
  3561. return true;
  3562. }
  3563. case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
  3564. case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
  3565. case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
  3566. case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
  3567. case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
  3568. case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
  3569. case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
  3570. case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
  3571. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3572. Info.memVT = MVT::f64;
  3573. Info.ptrVal = I.getArgOperand(0);
  3574. Info.offset = 0;
  3575. Info.flags = MachineMemOperand::MOLoad;
  3576. Info.align = Align(8);
  3577. return true;
  3578. }
  3579. case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
  3580. case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
  3581. case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
  3582. case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
  3583. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3584. Info.memVT = MVT::v2f64;
  3585. Info.ptrVal = I.getArgOperand(0);
  3586. Info.offset = 0;
  3587. Info.flags = MachineMemOperand::MOLoad;
  3588. Info.align = Align(16);
  3589. return true;
  3590. }
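// WMMA stores mirror the corresponding accumulator loads above: same
// fragment types and alignment, but modeled as INTRINSIC_VOID nodes that
// only write memory (MOStore).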
  3591. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
  3592. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
  3593. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
  3594. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
  3595. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
  3596. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
  3597. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
  3598. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
  3599. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
  3600. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
  3601. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
  3602. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
  3603. Info.opc = ISD::INTRINSIC_VOID;
  3604. Info.memVT = MVT::v4f16;
  3605. Info.ptrVal = I.getArgOperand(0);
  3606. Info.offset = 0;
  3607. Info.flags = MachineMemOperand::MOStore;
  3608. Info.align = Align(16);
  3609. return true;
  3610. }
  3611. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
  3612. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
  3613. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
  3614. case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
  3615. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
  3616. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
  3617. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
  3618. case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
  3619. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
  3620. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
  3621. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
  3622. case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
  3623. case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
  3624. case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
  3625. case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
  3626. case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
  3627. Info.opc = ISD::INTRINSIC_VOID;
  3628. Info.memVT = MVT::v8f32;
  3629. Info.ptrVal = I.getArgOperand(0);
  3630. Info.offset = 0;
  3631. Info.flags = MachineMemOperand::MOStore;
  3632. Info.align = Align(16);
  3633. return true;
  3634. }
  3635. case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
  3636. case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
  3637. case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
  3638. case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
  3639. case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
  3640. case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
  3641. case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
  3642. case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
  3643. case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
  3644. case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
  3645. case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
  3646. case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
  3647. Info.opc = ISD::INTRINSIC_VOID;
  3648. Info.memVT = MVT::v8i32;
  3649. Info.ptrVal = I.getArgOperand(0);
  3650. Info.offset = 0;
  3651. Info.flags = MachineMemOperand::MOStore;
  3652. Info.align = Align(16);
  3653. return true;
  3654. }
  3655. case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
  3656. case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
  3657. case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
  3658. case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
  3659. case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
  3660. case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
  3661. case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
  3662. case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
  3663. Info.opc = ISD::INTRINSIC_VOID;
  3664. Info.memVT = MVT::v2i32;
  3665. Info.ptrVal = I.getArgOperand(0);
  3666. Info.offset = 0;
  3667. Info.flags = MachineMemOperand::MOStore;
  3668. Info.align = Align(8);
  3669. return true;
  3670. }
  3671. case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
  3672. case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
  3673. case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
  3674. case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
  3675. Info.opc = ISD::INTRINSIC_VOID;
  3676. Info.memVT = MVT::v2f64;
  3677. Info.ptrVal = I.getArgOperand(0);
  3678. Info.offset = 0;
  3679. Info.flags = MachineMemOperand::MOStore;
  3680. Info.align = Align(16);
  3681. return true;
  3682. }
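// Scoped atomics both read and modify the addressed location, so they are
// tagged MOLoad | MOStore; the access width comes from the call's result
// type and no particular alignment is assumed.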
  3683. case Intrinsic::nvvm_atomic_load_inc_32:
  3684. case Intrinsic::nvvm_atomic_load_dec_32:
  3685. case Intrinsic::nvvm_atomic_add_gen_f_cta:
  3686. case Intrinsic::nvvm_atomic_add_gen_f_sys:
  3687. case Intrinsic::nvvm_atomic_add_gen_i_cta:
  3688. case Intrinsic::nvvm_atomic_add_gen_i_sys:
  3689. case Intrinsic::nvvm_atomic_and_gen_i_cta:
  3690. case Intrinsic::nvvm_atomic_and_gen_i_sys:
  3691. case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  3692. case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  3693. case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  3694. case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  3695. case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  3696. case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  3697. case Intrinsic::nvvm_atomic_max_gen_i_cta:
  3698. case Intrinsic::nvvm_atomic_max_gen_i_sys:
  3699. case Intrinsic::nvvm_atomic_min_gen_i_cta:
  3700. case Intrinsic::nvvm_atomic_min_gen_i_sys:
  3701. case Intrinsic::nvvm_atomic_or_gen_i_cta:
  3702. case Intrinsic::nvvm_atomic_or_gen_i_sys:
  3703. case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  3704. case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  3705. case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  3706. case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
  3707. auto &DL = I.getModule()->getDataLayout();
  3708. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3709. Info.memVT = getValueType(DL, I.getType());
  3710. Info.ptrVal = I.getArgOperand(0);
  3711. Info.offset = 0;
  3712. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
  3713. Info.align.reset();
  3714. return true;
  3715. }
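// ldu/ldg are read-only global loads. The loaded type is taken from the
// call's result type (or from the pointer type for the _p variants), and
// the alignment is passed explicitly as the second intrinsic operand.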
  3716. case Intrinsic::nvvm_ldu_global_i:
  3717. case Intrinsic::nvvm_ldu_global_f:
  3718. case Intrinsic::nvvm_ldu_global_p: {
  3719. auto &DL = I.getModule()->getDataLayout();
  3720. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3721. if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
  3722. Info.memVT = getValueType(DL, I.getType());
else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
  3724. Info.memVT = getPointerTy(DL);
  3725. else
  3726. Info.memVT = getValueType(DL, I.getType());
  3727. Info.ptrVal = I.getArgOperand(0);
  3728. Info.offset = 0;
  3729. Info.flags = MachineMemOperand::MOLoad;
  3730. Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
  3731. return true;
  3732. }
  3733. case Intrinsic::nvvm_ldg_global_i:
  3734. case Intrinsic::nvvm_ldg_global_f:
  3735. case Intrinsic::nvvm_ldg_global_p: {
  3736. auto &DL = I.getModule()->getDataLayout();
  3737. Info.opc = ISD::INTRINSIC_W_CHAIN;
  3738. if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
  3739. Info.memVT = getValueType(DL, I.getType());
else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
  3741. Info.memVT = getPointerTy(DL);
  3742. else
  3743. Info.memVT = getValueType(DL, I.getType());
  3744. Info.ptrVal = I.getArgOperand(0);
  3745. Info.offset = 0;
  3746. Info.flags = MachineMemOperand::MOLoad;
  3747. Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
  3748. return true;
  3749. }
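// Texture and tld4 reads produce a four-element result and have no
// IR-visible pointer operand, so ptrVal is left null.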
  3750. case Intrinsic::nvvm_tex_1d_v4f32_s32:
  3751. case Intrinsic::nvvm_tex_1d_v4f32_f32:
  3752. case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  3753. case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  3754. case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  3755. case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  3756. case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  3757. case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  3758. case Intrinsic::nvvm_tex_2d_v4f32_s32:
  3759. case Intrinsic::nvvm_tex_2d_v4f32_f32:
  3760. case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  3761. case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  3762. case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  3763. case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  3764. case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  3765. case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  3766. case Intrinsic::nvvm_tex_3d_v4f32_s32:
  3767. case Intrinsic::nvvm_tex_3d_v4f32_f32:
  3768. case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  3769. case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  3770. case Intrinsic::nvvm_tex_cube_v4f32_f32:
  3771. case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  3772. case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  3773. case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  3774. case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  3775. case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  3776. case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  3777. case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  3778. case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  3779. case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  3780. case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  3781. case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  3782. case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  3783. case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  3784. case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  3785. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  3786. case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  3787. case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  3788. case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  3789. case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  3790. case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  3791. case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  3792. case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  3793. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  3794. case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  3795. case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  3796. case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  3797. case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  3798. case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  3799. case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  3800. case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  3801. case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  3802. case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  3803. case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  3804. case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  3805. case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
  3806. Info.opc = getOpcForTextureInstr(Intrinsic);
  3807. Info.memVT = MVT::v4f32;
  3808. Info.ptrVal = nullptr;
  3809. Info.offset = 0;
  3810. Info.flags = MachineMemOperand::MOLoad;
  3811. Info.align = Align(16);
  3812. return true;
  3813. case Intrinsic::nvvm_tex_1d_v4s32_s32:
  3814. case Intrinsic::nvvm_tex_1d_v4s32_f32:
  3815. case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  3816. case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  3817. case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  3818. case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  3819. case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  3820. case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  3821. case Intrinsic::nvvm_tex_2d_v4s32_s32:
  3822. case Intrinsic::nvvm_tex_2d_v4s32_f32:
  3823. case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  3824. case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  3825. case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  3826. case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  3827. case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  3828. case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  3829. case Intrinsic::nvvm_tex_3d_v4s32_s32:
  3830. case Intrinsic::nvvm_tex_3d_v4s32_f32:
  3831. case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  3832. case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  3833. case Intrinsic::nvvm_tex_cube_v4s32_f32:
  3834. case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  3835. case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  3836. case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  3837. case Intrinsic::nvvm_tex_cube_v4u32_f32:
  3838. case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  3839. case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  3840. case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  3841. case Intrinsic::nvvm_tex_1d_v4u32_s32:
  3842. case Intrinsic::nvvm_tex_1d_v4u32_f32:
  3843. case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  3844. case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  3845. case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  3846. case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  3847. case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  3848. case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  3849. case Intrinsic::nvvm_tex_2d_v4u32_s32:
  3850. case Intrinsic::nvvm_tex_2d_v4u32_f32:
  3851. case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  3852. case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  3853. case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  3854. case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  3855. case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  3856. case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  3857. case Intrinsic::nvvm_tex_3d_v4u32_s32:
  3858. case Intrinsic::nvvm_tex_3d_v4u32_f32:
  3859. case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  3860. case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  3861. case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  3862. case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  3863. case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  3864. case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  3865. case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  3866. case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  3867. case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  3868. case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  3869. case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  3870. case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  3871. case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  3872. case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  3873. case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  3874. case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  3875. case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  3876. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  3877. case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  3878. case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  3879. case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  3880. case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  3881. case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  3882. case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  3883. case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  3884. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  3885. case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  3886. case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  3887. case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  3888. case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  3889. case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  3890. case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  3891. case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  3892. case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  3893. case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  3894. case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  3895. case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  3896. case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  3897. case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  3898. case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  3899. case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  3900. case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  3901. case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  3902. case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  3903. case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  3904. case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  3905. case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  3906. case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  3907. case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  3908. case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  3909. case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  3910. case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  3911. case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  3912. case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  3913. case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  3914. case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  3915. case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  3916. case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  3917. case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  3918. case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  3919. case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  3920. case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  3921. case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  3922. case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  3923. case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  3924. case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
  3925. Info.opc = getOpcForTextureInstr(Intrinsic);
  3926. Info.memVT = MVT::v4i32;
  3927. Info.ptrVal = nullptr;
  3928. Info.offset = 0;
  3929. Info.flags = MachineMemOperand::MOLoad;
  3930. Info.align = Align(16);
  3931. return true;
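// Surface loads (suld). memVT records only the element type being read;
// the v2/v4 variants share the same entry, and alignment is uniformly 16.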
  3932. case Intrinsic::nvvm_suld_1d_i8_clamp:
  3933. case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  3934. case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  3935. case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  3936. case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  3937. case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  3938. case Intrinsic::nvvm_suld_2d_i8_clamp:
  3939. case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  3940. case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  3941. case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  3942. case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  3943. case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  3944. case Intrinsic::nvvm_suld_3d_i8_clamp:
  3945. case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  3946. case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  3947. case Intrinsic::nvvm_suld_1d_i8_trap:
  3948. case Intrinsic::nvvm_suld_1d_v2i8_trap:
  3949. case Intrinsic::nvvm_suld_1d_v4i8_trap:
  3950. case Intrinsic::nvvm_suld_1d_array_i8_trap:
  3951. case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  3952. case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  3953. case Intrinsic::nvvm_suld_2d_i8_trap:
  3954. case Intrinsic::nvvm_suld_2d_v2i8_trap:
  3955. case Intrinsic::nvvm_suld_2d_v4i8_trap:
  3956. case Intrinsic::nvvm_suld_2d_array_i8_trap:
  3957. case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  3958. case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  3959. case Intrinsic::nvvm_suld_3d_i8_trap:
  3960. case Intrinsic::nvvm_suld_3d_v2i8_trap:
  3961. case Intrinsic::nvvm_suld_3d_v4i8_trap:
  3962. case Intrinsic::nvvm_suld_1d_i8_zero:
  3963. case Intrinsic::nvvm_suld_1d_v2i8_zero:
  3964. case Intrinsic::nvvm_suld_1d_v4i8_zero:
  3965. case Intrinsic::nvvm_suld_1d_array_i8_zero:
  3966. case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  3967. case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  3968. case Intrinsic::nvvm_suld_2d_i8_zero:
  3969. case Intrinsic::nvvm_suld_2d_v2i8_zero:
  3970. case Intrinsic::nvvm_suld_2d_v4i8_zero:
  3971. case Intrinsic::nvvm_suld_2d_array_i8_zero:
  3972. case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  3973. case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  3974. case Intrinsic::nvvm_suld_3d_i8_zero:
  3975. case Intrinsic::nvvm_suld_3d_v2i8_zero:
  3976. case Intrinsic::nvvm_suld_3d_v4i8_zero:
  3977. Info.opc = getOpcForSurfaceInstr(Intrinsic);
  3978. Info.memVT = MVT::i8;
  3979. Info.ptrVal = nullptr;
  3980. Info.offset = 0;
  3981. Info.flags = MachineMemOperand::MOLoad;
  3982. Info.align = Align(16);
  3983. return true;
  3984. case Intrinsic::nvvm_suld_1d_i16_clamp:
  3985. case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  3986. case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  3987. case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  3988. case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  3989. case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  3990. case Intrinsic::nvvm_suld_2d_i16_clamp:
  3991. case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  3992. case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  3993. case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  3994. case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  3995. case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  3996. case Intrinsic::nvvm_suld_3d_i16_clamp:
  3997. case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  3998. case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  3999. case Intrinsic::nvvm_suld_1d_i16_trap:
  4000. case Intrinsic::nvvm_suld_1d_v2i16_trap:
  4001. case Intrinsic::nvvm_suld_1d_v4i16_trap:
  4002. case Intrinsic::nvvm_suld_1d_array_i16_trap:
  4003. case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  4004. case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  4005. case Intrinsic::nvvm_suld_2d_i16_trap:
  4006. case Intrinsic::nvvm_suld_2d_v2i16_trap:
  4007. case Intrinsic::nvvm_suld_2d_v4i16_trap:
  4008. case Intrinsic::nvvm_suld_2d_array_i16_trap:
  4009. case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  4010. case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  4011. case Intrinsic::nvvm_suld_3d_i16_trap:
  4012. case Intrinsic::nvvm_suld_3d_v2i16_trap:
  4013. case Intrinsic::nvvm_suld_3d_v4i16_trap:
  4014. case Intrinsic::nvvm_suld_1d_i16_zero:
  4015. case Intrinsic::nvvm_suld_1d_v2i16_zero:
  4016. case Intrinsic::nvvm_suld_1d_v4i16_zero:
  4017. case Intrinsic::nvvm_suld_1d_array_i16_zero:
  4018. case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  4019. case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  4020. case Intrinsic::nvvm_suld_2d_i16_zero:
  4021. case Intrinsic::nvvm_suld_2d_v2i16_zero:
  4022. case Intrinsic::nvvm_suld_2d_v4i16_zero:
  4023. case Intrinsic::nvvm_suld_2d_array_i16_zero:
  4024. case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  4025. case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  4026. case Intrinsic::nvvm_suld_3d_i16_zero:
  4027. case Intrinsic::nvvm_suld_3d_v2i16_zero:
  4028. case Intrinsic::nvvm_suld_3d_v4i16_zero:
  4029. Info.opc = getOpcForSurfaceInstr(Intrinsic);
  4030. Info.memVT = MVT::i16;
  4031. Info.ptrVal = nullptr;
  4032. Info.offset = 0;
  4033. Info.flags = MachineMemOperand::MOLoad;
  4034. Info.align = Align(16);
  4035. return true;
  4036. case Intrinsic::nvvm_suld_1d_i32_clamp:
  4037. case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  4038. case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  4039. case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  4040. case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  4041. case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  4042. case Intrinsic::nvvm_suld_2d_i32_clamp:
  4043. case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  4044. case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  4045. case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  4046. case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  4047. case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  4048. case Intrinsic::nvvm_suld_3d_i32_clamp:
  4049. case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  4050. case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  4051. case Intrinsic::nvvm_suld_1d_i32_trap:
  4052. case Intrinsic::nvvm_suld_1d_v2i32_trap:
  4053. case Intrinsic::nvvm_suld_1d_v4i32_trap:
  4054. case Intrinsic::nvvm_suld_1d_array_i32_trap:
  4055. case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  4056. case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  4057. case Intrinsic::nvvm_suld_2d_i32_trap:
  4058. case Intrinsic::nvvm_suld_2d_v2i32_trap:
  4059. case Intrinsic::nvvm_suld_2d_v4i32_trap:
  4060. case Intrinsic::nvvm_suld_2d_array_i32_trap:
  4061. case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  4062. case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  4063. case Intrinsic::nvvm_suld_3d_i32_trap:
  4064. case Intrinsic::nvvm_suld_3d_v2i32_trap:
  4065. case Intrinsic::nvvm_suld_3d_v4i32_trap:
  4066. case Intrinsic::nvvm_suld_1d_i32_zero:
  4067. case Intrinsic::nvvm_suld_1d_v2i32_zero:
  4068. case Intrinsic::nvvm_suld_1d_v4i32_zero:
  4069. case Intrinsic::nvvm_suld_1d_array_i32_zero:
  4070. case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  4071. case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  4072. case Intrinsic::nvvm_suld_2d_i32_zero:
  4073. case Intrinsic::nvvm_suld_2d_v2i32_zero:
  4074. case Intrinsic::nvvm_suld_2d_v4i32_zero:
  4075. case Intrinsic::nvvm_suld_2d_array_i32_zero:
  4076. case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  4077. case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  4078. case Intrinsic::nvvm_suld_3d_i32_zero:
  4079. case Intrinsic::nvvm_suld_3d_v2i32_zero:
  4080. case Intrinsic::nvvm_suld_3d_v4i32_zero:
  4081. Info.opc = getOpcForSurfaceInstr(Intrinsic);
  4082. Info.memVT = MVT::i32;
  4083. Info.ptrVal = nullptr;
  4084. Info.offset = 0;
  4085. Info.flags = MachineMemOperand::MOLoad;
  4086. Info.align = Align(16);
  4087. return true;
  4088. case Intrinsic::nvvm_suld_1d_i64_clamp:
  4089. case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  4090. case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  4091. case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  4092. case Intrinsic::nvvm_suld_2d_i64_clamp:
  4093. case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  4094. case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  4095. case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  4096. case Intrinsic::nvvm_suld_3d_i64_clamp:
  4097. case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  4098. case Intrinsic::nvvm_suld_1d_i64_trap:
  4099. case Intrinsic::nvvm_suld_1d_v2i64_trap:
  4100. case Intrinsic::nvvm_suld_1d_array_i64_trap:
  4101. case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  4102. case Intrinsic::nvvm_suld_2d_i64_trap:
  4103. case Intrinsic::nvvm_suld_2d_v2i64_trap:
  4104. case Intrinsic::nvvm_suld_2d_array_i64_trap:
  4105. case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  4106. case Intrinsic::nvvm_suld_3d_i64_trap:
  4107. case Intrinsic::nvvm_suld_3d_v2i64_trap:
  4108. case Intrinsic::nvvm_suld_1d_i64_zero:
  4109. case Intrinsic::nvvm_suld_1d_v2i64_zero:
  4110. case Intrinsic::nvvm_suld_1d_array_i64_zero:
  4111. case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  4112. case Intrinsic::nvvm_suld_2d_i64_zero:
  4113. case Intrinsic::nvvm_suld_2d_v2i64_zero:
  4114. case Intrinsic::nvvm_suld_2d_array_i64_zero:
  4115. case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  4116. case Intrinsic::nvvm_suld_3d_i64_zero:
  4117. case Intrinsic::nvvm_suld_3d_v2i64_zero:
  4118. Info.opc = getOpcForSurfaceInstr(Intrinsic);
  4119. Info.memVT = MVT::i64;
  4120. Info.ptrVal = nullptr;
  4121. Info.offset = 0;
  4122. Info.flags = MachineMemOperand::MOLoad;
  4123. Info.align = Align(16);
  4124. return true;
  4125. }
  4126. return false;
  4127. }
/// getFunctionParamOptimizedAlign - Since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
  4135. Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
  4136. const Function *F, Type *ArgTy, const DataLayout &DL) const {
  4137. const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
// If a function has linkage different from internal or private, we
// must use the default ABI alignment as external users rely on it. The
// same applies to a function that may be called through a function pointer.
  4141. if (!F || !F->hasLocalLinkage() ||
  4142. F->hasAddressTaken(/*Users=*/nullptr,
  4143. /*IgnoreCallbackUses=*/false,
  4144. /*IgnoreAssumeLikeCalls=*/true,
  4145. /*IgnoreLLVMUsed=*/true))
  4146. return Align(ABITypeAlign);
  4147. assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
  4148. return Align(std::max(uint64_t(16), ABITypeAlign));
  4149. }
  4150. /// Helper for computing alignment of a device function byval parameter.
  4151. Align NVPTXTargetLowering::getFunctionByValParamAlign(
  4152. const Function *F, Type *ArgTy, Align InitialAlign,
  4153. const DataLayout &DL) const {
  4154. Align ArgAlign = InitialAlign;
  4155. // Try to increase alignment to enhance vectorization options.
  4156. if (F)
  4157. ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
// Work around a bug in ptxas. When PTX code takes the address of a
// byval parameter with alignment < 4, ptxas generates code to spill
// the argument into memory. Alas, on sm_50+ ptxas generates SASS code
// that fails with a misaligned access. To work around the problem,
// make sure that we align byval parameters to at least 4.
// TODO: this will need to be undone when we get to support multi-TU
// device-side compilation as it breaks ABI compatibility with nvcc.
// Hopefully the ptxas bug is fixed by then.
  4167. ArgAlign = std::max(ArgAlign, Align(4));
  4168. return ArgAlign;
  4169. }
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target-specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address modes
/// (CodeGenPrepare.cpp).
  4175. bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
  4176. const AddrMode &AM, Type *Ty,
  4177. unsigned AS, Instruction *I) const {
  4178. // AddrMode - This represents an addressing mode of:
  4179. // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  4180. //
  4181. // The legal address modes are
  4182. // - [avar]
  4183. // - [areg]
  4184. // - [areg+immoff]
  4185. // - [immAddr]
  4186. if (AM.BaseGV) {
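// A global address ([avar]) must stand alone: reject any combination with
// an offset, a base register, or a scaled register.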
  4187. return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
  4188. }
  4189. switch (AM.Scale) {
  4190. case 0: // "r", "r+i" or "i" is allowed
  4191. break;
  4192. case 1:
  4193. if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
  4194. return false;
  4195. // Otherwise we have r+i.
  4196. break;
  4197. default:
  4198. // No scale > 1 is allowed
  4199. return false;
  4200. }
  4201. return true;
  4202. }
  4203. //===----------------------------------------------------------------------===//
  4204. // NVPTX Inline Assembly Support
  4205. //===----------------------------------------------------------------------===//
  4206. /// getConstraintType - Given a constraint letter, return the type of
  4207. /// constraint it is for this target.
  4208. NVPTXTargetLowering::ConstraintType
  4209. NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  4210. if (Constraint.size() == 1) {
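// Each single-letter constraint handled here is classified as a
// register-class constraint; everything else falls back to the generic
// TargetLowering handling.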
  4211. switch (Constraint[0]) {
  4212. default:
  4213. break;
  4214. case 'b':
  4215. case 'r':
  4216. case 'h':
  4217. case 'c':
  4218. case 'l':
  4219. case 'f':
  4220. case 'd':
  4221. case '0':
  4222. case 'N':
  4223. return C_RegisterClass;
  4224. }
  4225. }
  4226. return TargetLowering::getConstraintType(Constraint);
  4227. }
  4228. std::pair<unsigned, const TargetRegisterClass *>
  4229. NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
  4230. StringRef Constraint,
  4231. MVT VT) const {
  4232. if (Constraint.size() == 1) {
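// Map the constraint letter to the corresponding NVPTX register class:
// 'b' -> i1 predicates, 'c'/'h' -> 16-bit, 'r' -> 32-bit, 'l'/'N' -> 64-bit
// integers, 'f' -> f32, 'd' -> f64.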
  4233. switch (Constraint[0]) {
  4234. case 'b':
  4235. return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
  4236. case 'c':
  4237. return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
  4238. case 'h':
  4239. return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
  4240. case 'r':
  4241. return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
  4242. case 'l':
  4243. case 'N':
  4244. return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
  4245. case 'f':
  4246. return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
  4247. case 'd':
  4248. return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
  4249. }
  4250. }
  4251. return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  4252. }
  4253. //===----------------------------------------------------------------------===//
  4254. // NVPTX DAG Combining
  4255. //===----------------------------------------------------------------------===//
  4256. bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
  4257. CodeGenOpt::Level OptLevel) const {
  4258. // Always honor command-line argument
  4259. if (FMAContractLevelOpt.getNumOccurrences() > 0)
  4260. return FMAContractLevelOpt > 0;
  4261. // Do not contract if we're not optimizing the code.
  4262. if (OptLevel == 0)
  4263. return false;
  4264. // Honor TargetOptions flags that explicitly say fusion is okay.
  4265. if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
  4266. return true;
  4267. return allowUnsafeFPMath(MF);
  4268. }

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
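    // For example, when a*b has no other users:
    //   t = mul i32 a, b
    //   r = add i32 t, c
    // becomes a single NVPTXISD::IMAD node, i.e. mad.lo.s32 r, a, b, c.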
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has less than 5 uses and all
      // are add.
      // The heuristic is that if a use is not an add, then that use
      // cannot be fused into fma, therefore the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
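      // For example, if t = fmul a, b is used by two fadd nodes, both uses
      // can be rewritten as fma(a, b, c) and the fmul disappears.  With five
      // or more uses, or with non-add uses that keep the fmul alive anyway,
      // the transform is skipped unless the checks below suggest it will not
      // increase register pressure.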
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating register pressure: the IR-order
        // difference approximates the distance between the def and this use,
        // and a longer distance is more likely to cause register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // the node N, which guarantees that the FMA will not increase
        // register pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored
  for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
    if (!N->getOperand(I).isUndef())
      return SDValue();

  // Operand 0 is the previous value in the chain. Cannot return EntryToken
  // as the previous value will become unused and eliminated later.
  return N->getOperand(0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOpt::Level OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  if (SDValue Result =
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
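  // For instance, once a <4 x i8> load has been turned into an
  // NVPTXISD::LoadV4 whose elements are zero-extended i8 values held in i16,
  // a pattern such as (and (any_extend <load result>), 0xff) carries a
  // redundant mask; the AND is dropped and replaced by the (extended) load
  // result below.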
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType =
        cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands() - 1))->
            getZExtValue();
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOpt::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);
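
  // If the DAG already computes Num / Den, rewrite Num % Den in terms of that
  // division so that only one divide is needed.  For example:
  //   q = a / b
  //   r = a % b
  // becomes
  //   q = a / b
  //   r = a - (a / b) * b   // reuses the existing division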
  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}

enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned) {
      return Val.isIntN(OptSize);
    } else {
      return Val.isSignedIntN(OptSize);
    }
  } else {
    OperandSignedness RHSSign;
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
      return false;

    return LHSSign == RHSSign;
  }
}

/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
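///
/// For example, with 16-bit values sign-extended to i32:
///   (mul i32 (sext i16 %a), (sext i16 %b))  -->  mul.wide.s16 %a, %b
/// A left shift by a small constant is handled the same way, since
///   (shl i32 (zext i16 %a), 3)  ==  (mul i32 (zext i16 %a), 8).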
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
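/// For example, a sufficiently aligned load of <4 x float> becomes an
/// NVPTXISD::LoadV4 producing four f32 results plus a chain (eventually
/// selected as a v4 load such as ld.global.v4.f32), and <8 x half> is
/// handled as four v2f16 chunks loaded with a single ld.v4.b32.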
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16: // <4 x f16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool LoadF16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
           "Unsupported v8 vector type.");
    LoadF16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16;
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (LoadF16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
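  // Lowers the llvm.nvvm.ldg.global.* / llvm.nvvm.ldu.global.* intrinsics:
  // vector results become LDGV2/LDGV4 (resp. LDUV2/LDUV4) target nodes with
  // sub-i16 elements widened to i16, and scalar i8 results are widened to i16
  // and truncated back after the load.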
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}
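
// Decide how atomicrmw instructions are lowered: AtomicExpansionKind::None
// keeps the operation as a native PTX atomic (e.g. f32 add, 32-bit integer
// add/sub/min/max, bitwise ops and exchange, plus their 64-bit and f64 forms
// when the subtarget reports hasAtomBitwise64 / hasAtomMinMax64 /
// hasAtomAddF64), while everything else is expanded to a compare-and-swap
// loop.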
NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}