//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <atomic> // For std::atomic<unsigned> GlobalUniqueCallSite below.
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned>
    FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                        cl::desc("NVPTX Specific: FMA contraction (0: don't do"
                                 " it, 1: do it, 2: do it aggressively)"),
                        cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it.
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled.
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}
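// Example (a sketch; the flag spelling follows the cl::opt definition above):
//   llc -march=nvptx64 -nvptx-prec-divf32=0 kernel.ll
// would make this query return 0 and select div.approx.f32 even when fast
// math is not enabled. ("kernel.ll" is a hypothetical input file.)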

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it.
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled.
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}
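// Note: the output denormal mode is derived from the function-level
// "denormal-fp-math-f32" (or "denormal-fp-math") attribute, so, for example,
// a function carrying "denormal-fp-math-f32"="preserve-sign" would be
// expected to select the .ftz forms of f32 instructions.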

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64).
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
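// Worked example (a sketch, assuming the usual NVPTX data layout and 8-byte
// alignment for the vector member): a hypothetical parameter of type
// { i32, <4 x half>, float } would flatten to
//   ValueVTs = { i32, v2f16, v2f16, f32 },  Offsets = { 0, 8, 12, 16 },
// since even-length f16 vectors are kept as v2f16 pairs while other
// aggregates and vectors are broken into scalar leaves.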

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}
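// Worked example: for ValueVTs = { f32, f32, f32, f32 } at
// Offsets = { 0, 4, 8, 12 } with ParamAlignment = 16, a query at Idx = 0
// with AccessSize = 16 returns 4 (one v4.f32 access covers all four pieces).
// Dropping ParamAlignment to 8 makes that query return 1, though
// AccessSize = 8 would still merge adjacent pairs and return 2.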

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0, // Middle elements of a vector.
  PVF_FIRST = 0x1, // First element of the vector.
  PVF_LAST = 0x2,  // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}
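// Continuing the example above: { f32, f32, f32, f32 } at offsets
// { 0, 4, 8, 12 } with 16-byte alignment yields
// { PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST }, i.e. one 4-element vector
// access; any piece that cannot be merged keeps its default PVF_SCALAR mark.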

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);
  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);

  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);

  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers.
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX.
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);
    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first.
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent the legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the register properties based on the actions set above.
  computeRegisterProperties(STI.getRegisterInfo());
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintConvergentCall:
    return "NVPTXISD::PrintConvergentCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::PrintConvergentCallUni:
    return "NVPTXISD::PrintConvergentCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::ProxyReg:
    return "NVPTXISD::ProxyReg";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
  case NVPTXISD::IMAD:
    return "NVPTXISD::IMAD";
  case NVPTXISD::SETP_F16X2:
    return "NVPTXISD::SETP_F16X2";
  case NVPTXISD::Dummy:
    return "NVPTXISD::Dummy";
  case NVPTXISD::MUL_WIDE_SIGNED:
    return "NVPTXISD::MUL_WIDE_SIGNED";
  case NVPTXISD::MUL_WIDE_UNSIGNED:
    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";
  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  878. case NVPTXISD::TexUnified3DU32FloatLevel:
  879. return "NVPTXISD::TexUnified3DU32FloatLevel";
  880. case NVPTXISD::TexUnified3DU32FloatGrad:
  881. return "NVPTXISD::TexUnified3DU32FloatGrad";
  882. case NVPTXISD::TexUnifiedCubeFloatFloat:
  883. return "NVPTXISD::TexUnifiedCubeFloatFloat";
  884. case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  885. return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  886. case NVPTXISD::TexUnifiedCubeS32Float:
  887. return "NVPTXISD::TexUnifiedCubeS32Float";
  888. case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  889. return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  890. case NVPTXISD::TexUnifiedCubeU32Float:
  891. return "NVPTXISD::TexUnifiedCubeU32Float";
  892. case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  893. return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  894. case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  895. return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  896. case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  897. return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  898. case NVPTXISD::TexUnifiedCubeArrayS32Float:
  899. return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  900. case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  901. return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  902. case NVPTXISD::TexUnifiedCubeArrayU32Float:
  903. return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  904. case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  905. return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  906. case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  907. return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  908. case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  909. return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  910. case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  911. return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  912. case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  913. return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  914. case NVPTXISD::Tld4UnifiedR2DS64Float:
  915. return "NVPTXISD::Tld4UnifiedR2DS64Float";
  916. case NVPTXISD::Tld4UnifiedG2DS64Float:
  917. return "NVPTXISD::Tld4UnifiedG2DS64Float";
  918. case NVPTXISD::Tld4UnifiedB2DS64Float:
  919. return "NVPTXISD::Tld4UnifiedB2DS64Float";
  920. case NVPTXISD::Tld4UnifiedA2DS64Float:
  921. return "NVPTXISD::Tld4UnifiedA2DS64Float";
  922. case NVPTXISD::Tld4UnifiedR2DU64Float:
  923. return "NVPTXISD::Tld4UnifiedR2DU64Float";
  924. case NVPTXISD::Tld4UnifiedG2DU64Float:
  925. return "NVPTXISD::Tld4UnifiedG2DU64Float";
  926. case NVPTXISD::Tld4UnifiedB2DU64Float:
  927. return "NVPTXISD::Tld4UnifiedB2DU64Float";
  928. case NVPTXISD::Tld4UnifiedA2DU64Float:
  929. return "NVPTXISD::Tld4UnifiedA2DU64Float";
  930. case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
  931. case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
  932. case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
  933. case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
  934. case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
  935. case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
  936. case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
  937. case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
  938. case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
  939. case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
  940. case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
  941. case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
  942. case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
  943. case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
  944. case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
  945. case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  946. case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  947. case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  948. case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  949. case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  950. case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  951. case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
  952. case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
  953. case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
  954. case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
  955. case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
  956. case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
  957. case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
  958. case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
  959. case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
  960. case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
  961. case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
  962. case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
  963. case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
  964. case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
  965. case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
  966. case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
  967. case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  968. case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  969. case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  970. case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  971. case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  972. case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  973. case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
  974. case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
  975. case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
  976. case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
  977. case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
  978. case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
  979. case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
  980. case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
  981. case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
  982. case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
  983. case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
  984. case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
  985. case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
  986. case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
  987. case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
  988. case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
  989. case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
  990. case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
  991. case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
  992. case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
  993. case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
  994. case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
  995. case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
  996. case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
  997. case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
  998. case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
  999. case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
  1000. case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
  1001. case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
  1002. case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
  1003. case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
  1004. case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
  1005. case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
  1006. case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
  1007. case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
  1008. case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
  1009. case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
  1010. case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
  1011. case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
  1012. case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
  1013. case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
  1014. case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
  1015. case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
  1016. case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
  1017. case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
  1018. case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
  1019. case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
  1020. case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
  1021. case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
  1022. case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
  1023. case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
  1024. case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
  1025. case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
  1026. case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
  1027. case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
  1028. case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
  1029. case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
  1030. case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
  1031. case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
  1032. case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
  1033. case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
  1034. case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
  1035. case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
  1036. case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
  1037. case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
  1038. case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
  1039. case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
  1040. case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
  1041. case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
  1042. case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
  1043. case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
  1044. case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
  1045. case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
  1046. case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
  1047. case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
  1048. case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
  1049. case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
  1050. case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
  1051. case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
  1052. case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
  1053. case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
  1054. case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
  1055. case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
  1056. case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
  1057. case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
  1058. case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
  1059. case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
  1060. case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
  1061. case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
  1062. case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
  1063. case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
  1064. case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
  1065. case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
  1066. case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
  1067. case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
  1068. case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
  1069. case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
  1070. case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
  1071. case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
  1072. case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
  1073. case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
  1074. case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
  1075. case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
  1076. case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
  1077. case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
  1078. case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
  1079. case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
  1080. case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
  1081. case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
  1082. case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
  1083. case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
  1084. case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
  1085. case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
  1086. case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
  1087. case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
  1088. case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
  1089. case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
  1090. case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
  1091. case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
  1092. case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
  1093. case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
  1094. case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
  1095. }
  1096. return nullptr;
  1097. }

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
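  // v2f16 is packed into a single 32-bit register, so splitting it into
  // scalar halves would only add moves; keeping it legal lets it be handled
  // as one .b32 value (see LowerBUILD_VECTOR below).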
  if (VT == MVT::v2f16)
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
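  // Note (illustrative): when ExtraSteps > 0, the refinement mentioned above
  // is performed by the generic DAG combiner, which typically applies
  // Newton-Raphson iterations to the rsqrt seed returned here.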
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";
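  // For illustration only (hypothetical callee): a function returning float
  // and taking (i32, float) parameters would yield a string roughly like
  //   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);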
  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() ||
        (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size. fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      if (size < 32)
        size = 32;

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
               retTy->isIntegerTy(128)) {
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(&CB);
        // +1 because index 0 is reserved for return type alignment
        if (!getAlign(*CallI, i + 1, align))
          align = DL.getABITypeAlignment(Ty);
        unsigned sz = DL.getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else if (Ty->isHalfTy())
        // PTX ABI requires all scalar parameters to be at least 32
        // bits in size. fp16 normally uses .b16 as its storage type
        // in PTX, so its size must be adjusted here, too.
        sz = 32;
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }
    auto *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getPointerElementType();

    Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
    unsigned sz = DL.getTypeAllocSize(ETy);
    O << ".param .align " << align.value() << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}

Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                                const CallBase *CB, Type *Ty,
                                                unsigned Idx,
                                                const DataLayout &DL) const {
  if (!CB) {
    // CallSite is zero, fall back to ABI type alignment.
    return DL.getABITypeAlign(Ty);
  }

  unsigned Alignment = 0;
  const Function *DirectCallee = CB->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    // With bitcast'd call targets, the instruction will be the call.
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      // Check if we have call alignment metadata.
      if (getAlign(*CI, Idx, Alignment))
        return Align(Alignment);

      const Value *CalleeV = CI->getCalledOperand();
      // Ignore any bitcast instructions.
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast.
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts. Do we finally have a
      // Function?
      if (const auto *CalleeF = dyn_cast<Function>(CalleeV))
        DirectCallee = CalleeF;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function.
  if (DirectCallee)
    if (getAlign(*DirectCallee, Idx, Alignment))
      return Align(Alignment);

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment.
  return DL.getABITypeAlign(Ty);
}

SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *RetTy = CLI.RetTy;
  const CallBase *CB = CLI.CB;
  const DataLayout &DL = DAG.getDataLayout();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
  SDValue tempChain = Chain;
  Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
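  // Illustration (hypothetical): a struct {i32, i32} argument occupies one
  // entry in Args but two entries in Outs, so OIdx below advances
  // independently of i.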
  unsigned OIdx = 0;
  // Declare the .params or .reg needed to pass values
  // to the function.
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    EVT VT = Outs[OIdx].VT;
    Type *Ty = Args[i].Ty;

    if (!Outs[OIdx].Flags.isByVal()) {
      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
      Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL);
      unsigned AllocSize = DL.getTypeAllocSize(Ty);
      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      bool NeedAlign; // Does argument declaration specify alignment?
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        // declare .param .align <align> .b8 .param<n>[<size>];
        SDValue DeclareParamOps[] = {
            Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
            DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                            DeclareParamOps);
        NeedAlign = true;
      } else {
        // declare .param .b<size> .param<n>;
        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
          // PTX ABI requires integral types to be at least 32 bits in
          // size. FP16 is loaded/stored using i16, so it's handled
          // here as well.
          AllocSize = 4;
        }
        SDValue DeclareScalarParamOps[] = {
            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
            DAG.getConstant(0, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                            DeclareScalarParamOps);
        NeedAlign = false;
      }
      InFlag = Chain.getValue(1);

      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
      // than 32-bits are sign extended or zero extended, depending on
      // whether they are signed or unsigned types. This case applies
      // only to scalar parameters and not to aggregate values.
      bool ExtendIntegerParam =
          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;

      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
      SmallVector<SDValue, 6> StoreOperands;
      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
        // New store.
        if (VectorInfo[j] & PVF_FIRST) {
          assert(StoreOperands.empty() && "Unfinished preceding store.");
          StoreOperands.push_back(Chain);
          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
        }

        EVT EltVT = VTs[j];
        SDValue StVal = OutVals[OIdx];
        if (ExtendIntegerParam) {
          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
          // zext/sext to i32
          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                        : ISD::ZERO_EXTEND,
                              dl, MVT::i32, StVal);
        } else if (EltVT.getSizeInBits() < 16) {
          // Use 16-bit registers for small stores as it's the
          // smallest general purpose register size supported by NVPTX.
          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
        }

        // Record the value to store.
        StoreOperands.push_back(StVal);

        if (VectorInfo[j] & PVF_LAST) {
          unsigned NumElts = StoreOperands.size() - 3;
          NVPTXISD::NodeType Op;
          switch (NumElts) {
          case 1:
            Op = NVPTXISD::StoreParam;
            break;
          case 2:
            Op = NVPTXISD::StoreParamV2;
            break;
          case 4:
            Op = NVPTXISD::StoreParamV4;
            break;
          default:
            llvm_unreachable("Invalid vector info.");
          }

          StoreOperands.push_back(InFlag);

          // Adjust type of the store op if we've extended the scalar
          // return value.
          EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
          MaybeAlign EltAlign;
          if (NeedAlign)
            EltAlign = commonAlignment(ArgAlign, Offsets[j]);

          Chain = DAG.getMemIntrinsicNode(
              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
              TheStoreType, MachinePointerInfo(), EltAlign,
              MachineMemOperand::MOStore);
          InFlag = Chain.getValue(1);

          // Cleanup.
          StoreOperands.clear();
        }
        ++OIdx;
      }
      assert(StoreOperands.empty() && "Unfinished parameter store.");
      if (VTs.size() > 0)
        --OIdx;
      ++paramCount;
      continue;
    }

    // ByVal arguments
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
    assert(PTy && "Type of a byval parameter should be pointer");
    ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets,
                       0);

    // declare .param .align <align> .b8 .param<n>[<size>];
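    // Illustration (hypothetical values): a 12-byte byval struct with 4-byte
    // alignment would be declared as
    //   .param .align 4 .b8 param2[12];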
    unsigned sz = Outs[OIdx].Flags.getByValSize();
    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
    // so we don't need to worry about natural alignment or not.
    // See TargetLowering::LowerCallTo().

    // Enforce minimum alignment of 4 to work around a ptxas miscompile
    // for sm_50+. See the corresponding alignment adjustment in
    // emitFunctionParamList() for details.
    if (ArgAlign < Align(4))
      ArgAlign = Align(4);
    SDValue DeclareParamOps[] = {
        Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
        DAG.getConstant(paramCount, dl, MVT::i32),
        DAG.getConstant(sz, dl, MVT::i32), InFlag};
    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                        DeclareParamOps);
    InFlag = Chain.getValue(1);
    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
      EVT elemtype = VTs[j];
      int curOffset = Offsets[j];
      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
      auto PtrVT = getPointerTy(DL);
      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                    DAG.getConstant(curOffset, dl, PtrVT));
      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                   MachinePointerInfo(), PartAlign);
      if (elemtype.getSizeInBits() < 16) {
        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
      }
      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue CopyParamOps[] = {Chain,
                                DAG.getConstant(paramCount, dl, MVT::i32),
                                DAG.getConstant(curOffset, dl, MVT::i32),
                                theVal, InFlag};
      Chain = DAG.getMemIntrinsicNode(
          NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
          MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
      InFlag = Chain.getValue(1);
    }
    ++paramCount;
  }

  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  MaybeAlign retAlignment = None;

  // Handle Result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, DL, RetTy, resvtparts);

    // Declare
    //   .param .align 16 .b8 retval0[<size-in-bytes>], or
    //   .param .b<size-in-bits> retval0
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
    // these three types to match the logic in
    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
    // Plus, this behavior is consistent with nvcc's.
    if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
        (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
      // Scalars need to be at least 32 bits wide.
      if (resultsz < 32)
        resultsz = 32;
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
                                 DAG.getConstant(resultsz, dl, MVT::i32),
                                 DAG.getConstant(0, dl, MVT::i32), InFlag};
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
      assert(retAlignment && "retAlignment is guaranteed to be set");
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = {
          Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
          DAG.getConstant(resultsz / 8, dl, MVT::i32),
          DAG.getConstant(0, dl, MVT::i32), InFlag};
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    }
  }

  // Both indirect calls and libcalls have a nullptr Func. In order to
  // distinguish between them we must rely on the call site value, which is
  // valid for indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CB;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function *CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }

  if (isIndirectCall) {
    // This is the indirect function call case: PTX requires a prototype of
    // the form
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
    // The prototype is embedded in a string and passed as the operand of a
    // CallPrototype SDNode, which prints out as the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto =
        getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite);
    const char *ProtoStr =
        nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
    SDValue ProtoOps[] = {
        Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InFlag = Chain.getValue(1);
  }

  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
      Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag};
  // We model convergent calls as separate opcodes.
  unsigned Opcode =
      isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  if (CLI.IsConvergent)
    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                              : NVPTXISD::PrintConvergentCall;
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = {Chain, Callee, InFlag};
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = {Chain, InFlag};
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InFlag = Chain.getValue(1);

  for (unsigned i = 0, e = paramCount; i != e; ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
                            DAG.getConstant(i, dl, MVT::i32), InFlag};
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InFlag = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = {
      Chain, DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), InFlag};
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InFlag = Chain.getValue(1);

  if (isIndirectCall) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = {
        Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InFlag = Chain.getValue(1);
  }
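  // The nodes emitted above print a PTX call sequence along the lines of
  // (illustrative; details vary with the callee, arguments, and return type):
  //   call (retval0), callee, (param0, param1), prototype_1;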

  SmallVector<SDValue, 16> ProxyRegOps;
  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
    assert(VTs.size() == Ins.size() && "Bad value decomposition");

    Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    SmallVector<EVT, 6> LoadVTs;
    int VecIdx = -1; // Index of the first element of the vector.

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
    // 32-bits are sign extended or zero extended, depending on whether
    // they are signed or unsigned types.
    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
      Align EltAlign = commonAlignment(RetAlign, Offsets[i]);

      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        EltType = MVT::i32;
        needTruncate = true;
      } else if (TheLoadType.getSizeInBits() < 16) {
        if (VTs[i].isInteger())
          needTruncate = true;
        EltType = MVT::i16;
      }

      // Record index of the very first element of the vector.
      if (VectorInfo[i] & PVF_FIRST) {
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        VecIdx = i;
      }

      LoadVTs.push_back(EltType);

      if (VectorInfo[i] & PVF_LAST) {
        unsigned NumElts = LoadVTs.size();
        LoadVTs.push_back(MVT::Other);
        LoadVTs.push_back(MVT::Glue);
        NVPTXISD::NodeType Op;
        switch (NumElts) {
        case 1:
          Op = NVPTXISD::LoadParam;
          break;
        case 2:
          Op = NVPTXISD::LoadParamV2;
          break;
        case 4:
          Op = NVPTXISD::LoadParamV4;
          break;
        default:
          llvm_unreachable("Invalid vector info.");
        }

        SDValue LoadOperands[] = {
            Chain, DAG.getConstant(1, dl, MVT::i32),
            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
        SDValue RetVal = DAG.getMemIntrinsicNode(
            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
            MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad);

        for (unsigned j = 0; j < NumElts; ++j) {
          ProxyRegOps.push_back(RetVal.getValue(j));

          if (needTruncate)
            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
          else
            ProxyRegTruncates.push_back(Optional<MVT>());
        }

        Chain = RetVal.getValue(NumElts);
        InFlag = RetVal.getValue(NumElts + 1);

        // Cleanup
        VecIdx = -1;
        LoadVTs.clear();
      }
    }
  }

  Chain = DAG.getCALLSEQ_END(
      Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true),
      DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl);
  InFlag = Chain.getValue(1);

  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can
  // become dangling.
  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
    SDValue Ret = DAG.getNode(
        NVPTXISD::ProxyReg, dl,
        DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other,
                      MVT::Glue),
        {Chain, ProxyRegOps[i], InFlag});

    Chain = Ret.getValue(1);
    InFlag = Ret.getValue(2);

    if (ProxyRegTruncates[i].hasValue()) {
      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(),
                        Ret);
    }

    InVals.push_back(Ret);
  }

  // Set isTailCall to false for now, until we figure out how to express
  // tail call optimization in PTX.
  isTailCall = false;
  return Chain;
}

// By default, CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
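// For example (illustrative): concatenating two v2f16 values a and b becomes
//   build_vector(a[0], a[1], b[0], b[1])
// built from four extract_vector_elt nodes.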
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j = 0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j, dl)));
    }
  }
  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}

// We can init a constant f16x2 with a single .b32 move. Normally it
// would get lowered as two constant loads and a vector-packing move:
//   mov.b16 %h1, 0x4000;
//   mov.b16 %h2, 0x3C00;
//   mov.b32 %hh2, {%h2, %h1};
// Instead we want just a constant move:
//   mov.b32 %hh2, 0x40003C00
//
// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
// generates good SASS in both cases.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  if (!(Op->getValueType(0) == MVT::v2f16 &&
        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
        isa<ConstantFPSDNode>(Op->getOperand(1))))
    return Op;

  APInt E0 =
      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
  APInt E1 =
      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
  SDValue Const =
      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
}

SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDValue Index = Op->getOperand(1);
  // Constant index will be matched by tablegen.
  if (isa<ConstantSDNode>(Index.getNode()))
    return Op;

  // Extract individual elements and select one of them.
  SDValue Vector = Op->getOperand(0);
  EVT VectorVT = Vector.getValueType();
  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
  EVT EltVT = VectorVT.getVectorElementType();

  SDLoc dl(Op.getNode());
  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(0, dl));
  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(1, dl));
  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
                         ISD::CondCode::SETEQ);
}

/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32 bit and sm_35+, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} >> Amt
    //   dHi = aHi >> Amt
    //   dLo = shf.r.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);

    SDValue Ops[2] = {Lo, Hi};
    return DAG.getMergeValues(Ops, dl);
  } else {
    // {dHi, dLo} = {aHi, aLo} >> Amt
    // - if (Amt >= size) then
    //     dLo = aHi >> (Amt - size)
    //     dHi = aHi >> Amt (this is either all 0 or all 1)
    //   else
    //     dLo = (aLo >>logic Amt) | (aHi << (size - Amt))
    //     dHi = aHi >> Amt

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = {Lo, Hi};
    return DAG.getMergeValues(Ops, dl);
  }
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32 bit and sm_35+, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    //   dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = {Lo, Hi};
    return DAG.getMergeValues(Ops, dl);
  } else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt >= size) then
    //     dLo = aLo << Amt (all 0)
    //     dHi = aLo << (Amt - size)
    //   else
    //     dLo = aLo << Amt
    //     dHi = (aHi << Amt) | (aLo >> (size - Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = {Lo, Hi};
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

// This is the rounding method used in CUDA libdevice, in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
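// Worked example (illustrative): roundf(2.5f) computes trunc(2.5 + 0.5) = 3.0
// and roundf(-2.5f) computes trunc(-2.5 - 0.5) = -3.0 (ties round away from
// zero), while roundf(0.3f) takes the abs(A) < 0.5 branch and yields
// trunc(0.3) = 0.0.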
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  const int SignBitMask = 0x80000000;
  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
                             DAG.getConstant(SignBitMask, SL, MVT::i32));
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  SDValue PointFiveWithSign =
      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA,
                   DAG.getConstantFP(pow(2.0, 23.0), SL, VT), ISD::SETOGT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
}
// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to each region to round the values. However, round(double) first
// rounds the absolute value and then copies the sign back, while round(float)
// rounds the value with its sign directly.
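//
// Worked example (a sketch): round(-2.3) first computes
// trunc(abs(-2.3) + 0.5) == trunc(2.8) == 2.0 and then copies the sign back,
// giving -2.0. Values with abs(A) < 0.5 round to a signed zero, and values
// with abs(A) > 2^52 are already integral in f64, so A is returned as is.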
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // double RoundedA = (double) (int) (abs(A) + 0.5f);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
                                  DAG.getConstantFP(0.5, SL, VT));
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
                         DAG.getConstantFP(0, SL, VT),
                         RoundedA);

  // Add the sign back to RoundedA.
  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);

  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA,
                   DAG.getConstantFP(pow(2.0, 52.0), SL, VT), ISD::SETOGT);
  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
}
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::RETURNADDR:
    return SDValue();
  case ISD::FRAMEADDR:
    return SDValue();
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return Op;
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::SELECT:
    return LowerSelect(Op, DAG);
  case ISD::FROUND:
    return LowerFROUND(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}
SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
  SDValue Op0 = Op->getOperand(0);
  SDValue Op1 = Op->getOperand(1);
  SDValue Op2 = Op->getOperand(2);
  SDLoc DL(Op.getNode());

  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");

  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);

  return Trunc;
}
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);

  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
  // loads and have to handle them here.
  if (Op.getValueType() == MVT::v2f16) {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
    EVT MemVT = Load->getMemoryVT();
    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        MemVT, *Load->getMemOperand())) {
      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
      return DAG.getMergeValues(Ops, SDLoc(Op));
    }
  }

  return SDValue();
}
// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
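//
// (The value is loaded with an 8-bit memory access into a 16-bit register:
// 16-bit registers are the smallest general-purpose register size NVPTX
// supports, as noted again for small stores in LowerReturn below.)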
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                              LD->getPointerInfo(), LD->getAlignment(),
                              LD->getMemOperand()->getFlags());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
  SDValue Ops[] = { result, LD->getChain() };
  return DAG.getMergeValues(Ops, dl);
}
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1)
    return LowerSTOREi1(Op, DAG);

  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
  // stores and have to handle them here.
  if (VT == MVT::v2f16 &&
      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  if (VT.isVector())
    return LowerSTOREVector(Op, DAG);

  return SDValue();
}
SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDValue Val = N->getOperand(1);
  SDLoc DL(N);
  EVT ValVT = Val.getValueType();

  if (ValVT.isVector()) {
    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
    // legal. We can (and should) split that into 2 stores of <2 x double>
    // here, but I'm leaving that as a TODO for now.
    if (!ValVT.isSimple())
      return SDValue();
    switch (ValVT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v2i8:
    case MVT::v2i16:
    case MVT::v2i32:
    case MVT::v2i64:
    case MVT::v2f16:
    case MVT::v2f32:
    case MVT::v2f64:
    case MVT::v4i8:
    case MVT::v4i16:
    case MVT::v4i32:
    case MVT::v4f16:
    case MVT::v4f32:
    case MVT::v8f16: // <4 x f16x2>
      // This is a "native" vector type
      break;
    }

    MemSDNode *MemSD = cast<MemSDNode>(N);
    const DataLayout &TD = DAG.getDataLayout();

    Align Alignment = MemSD->getAlign();
    Align PrefAlign =
        TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
    if (Alignment < PrefAlign) {
      // This store is not sufficiently aligned, so bail out and let this
      // vector store be scalarized. Note that we may still be able to emit
      // smaller vector stores. For example, if we are storing a <4 x float>
      // with an alignment of 8, this check will fail but the legalizer will
      // try again with 2 x <2 x float>, which will succeed with an alignment
      // of 8.
      return SDValue();
    }
    unsigned Opcode = 0;
    EVT EltVT = ValVT.getVectorElementType();
    unsigned NumElts = ValVT.getVectorNumElements();

    // Since StoreV2 is a target node, we cannot rely on DAG type
    // legalization. Therefore, we must ensure the type is legal. For i1 and
    // i8, we set the stored type to i16 and propagate the "real" type as the
    // memory type.
    bool NeedExt = false;
    if (EltVT.getSizeInBits() < 16)
      NeedExt = true;

    bool StoreF16x2 = false;
    switch (NumElts) {
    default:
      return SDValue();
    case 2:
      Opcode = NVPTXISD::StoreV2;
      break;
    case 4:
      Opcode = NVPTXISD::StoreV4;
      break;
    case 8:
      // v8f16 is a special case. PTX doesn't have an st.v8.f16
      // instruction. Instead, we split the vector into v2f16 chunks and
      // store them with st.v4.b32.
      assert(EltVT == MVT::f16 && "Wrong type for the vector.");
      Opcode = NVPTXISD::StoreV4;
      StoreF16x2 = true;
      break;
    }
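
    // (Sketch of the v8f16 repacking done below: elements <e0..e7> become
    // four v2f16 values <e0,e1>, <e2,e3>, <e4,e5>, <e6,e7>, which are the
    // four 32-bit data operands of the st.v4.b32.)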
    SmallVector<SDValue, 8> Ops;

    // First is the chain
    Ops.push_back(N->getOperand(0));

    if (StoreF16x2) {
      // Combine f16,f16 -> v2f16
      NumElts /= 2;
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
                                 DAG.getIntPtrConstant(i * 2, DL));
        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
        Ops.push_back(V2);
      }
    } else {
      // Then the split values
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                     DAG.getIntPtrConstant(i, DL));
        if (NeedExt)
          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
        Ops.push_back(ExtVal);
      }
    }

    // Then any remaining arguments
    Ops.append(N->op_begin() + 2, N->op_end());

    SDValue NewSt =
        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
                                MemSD->getMemoryVT(), MemSD->getMemOperand());

    // return DCI.CombineTo(N, NewSt, true);
    return NewSt;
  }
  return SDValue();
}
// st i1 v, addr
//    =>
// v1 = zxt v to i16
// st.u8 i16, addr
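//
// (Zero-extension, rather than any-extension, keeps the byte written to
// memory a well-defined 0 or 1 instead of leaving its upper bits
// unspecified.)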
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  StoreSDNode *ST = cast<StoreSDNode>(Node);
  SDValue Tmp1 = ST->getChain();
  SDValue Tmp2 = ST->getBasePtr();
  SDValue Tmp3 = ST->getValue();
  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
  SDValue Result =
      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
                        ST->getAlignment(), ST->getMemOperand()->getFlags());
  return Result;
}
SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
  std::string ParamSym;
  raw_string_ostream ParamStr(ParamSym);

  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
  ParamStr.flush();

  std::string *SavedStr =
      nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
}
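
// (For example, parameter 2 of a function named "foo" is referenced through
// the symbol "foo_param_2".)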
// Check to see if the kernel argument is image*_t or sampler_t
static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
  static const char *const specialTypes[] = { "struct._image2d_t",
                                              "struct._image3d_t",
                                              "struct._sampler_t" };

  Type *Ty = arg->getType();
  auto *PTy = dyn_cast<PointerType>(Ty);
  if (!PTy)
    return false;

  if (!context)
    return false;

  auto *STy = dyn_cast<StructType>(PTy->getPointerElementType());
  if (!STy || STy->isLiteral())
    return false;

  return llvm::is_contained(specialTypes, STy->getName());
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout &DL = DAG.getDataLayout();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  const Function *F = &MF.getFunction();
  const AttributeList &PAL = F->getAttributes();
  const TargetLowering *TLI = STI.getTargetLowering();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (const Argument &I : F->args()) {
    theArgs.push_back(&I);
    argTypes.push_back(I.getType());
  }
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Ins.
  // So a different index should be used for indexing into Ins.
  // See the similar issue in LowerCall.
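  //
  // (For example, one IR argument of type { i32, float } contributes a
  // single entry to theArgs but two entries to Ins, one per field.)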
  unsigned InsIdx = 0;

  int idx = 0;
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
    Type *Ty = argTypes[i];

    // If the kernel argument is image*_t or sampler_t, convert it to
    // an i32 constant holding the parameter position. This can later be
    // matched in the AsmPrinter to output the correct mangled name.
    if (isImageOrSamplerVal(
            theArgs[i],
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                     : nullptr))) {
      assert(isKernelFunction(*F) &&
             "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
      continue;
    }

    if (theArgs[i]->use_empty()) {
      // Argument is dead; emit UNDEF placeholders.
      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(DL, Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }
    // In the following cases, assign a node order of "idx+1" to newly
    // created nodes. The SDNodes for params have to appear in the same
    // order as their order of appearance in the original function.
    // "idx+1" holds that order.
    if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
      assert(VTs.size() > 0 && "Unexpected empty type.");
      auto VectorInfo =
          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));

      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
      int VecIdx = -1; // Index of the first element of the current vector.
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
        if (VectorInfo[parti] & PVF_FIRST) {
          assert(VecIdx == -1 && "Orphaned vector.");
          VecIdx = parti;
        }

        // That's the last element of this load op.
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          // i1 is loaded/stored as i8.
          EVT LoadVT = EltVT;
          if (EltVT == MVT::i1)
            LoadVT = MVT::i8;
          else if (EltVT == MVT::v2f16)
            // getLoad needs a vector type, but it can't handle
            // vectors which contain v2f16 elements. So we must load
            // using i32 here and then bitcast back.
            LoadVT = MVT::i32;

          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
          SDValue VecAddr =
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
          Value *srcValue = Constant::getNullValue(PointerType::get(
              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
          SDValue P =
              DAG.getLoad(VecVT, dl, Root, VecAddr,
                          MachinePointerInfo(srcValue), aggregateIsPacked,
                          MachineMemOperand::MODereferenceable |
                              MachineMemOperand::MOInvariant);
          if (P.getNode())
            P.getNode()->setIROrder(idx + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
                                      DAG.getIntPtrConstant(j, dl));
            // We've loaded i1 as an i8 and now must truncate it back to i1.
            if (EltVT == MVT::i1)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
            // v2f16 was loaded as an i32. Now we must bitcast it back.
            else if (EltVT == MVT::v2f16)
              Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);

            // Extend the element if necessary (e.g. an i8 is loaded
            // into an i16 register).
            if (Ins[InsIdx].VT.isInteger() &&
                Ins[InsIdx].VT.getFixedSizeInBits() >
                    LoadVT.getFixedSizeInBits()) {
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                           : ISD::ZERO_EXTEND;
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
            }
            InVals.push_back(Elt);
          }

          // Reset vector tracking state.
          VecIdx = -1;
        }
        ++InsIdx;
      }
      if (VTs.size() > 0)
        --InsIdx;
      continue;
    }
    // Param has the ByVal attribute.
    // Return MoveParam(param symbol).
    // Ideally, the param symbol could be returned directly, but when the
    // SDNode builder decides to use it in a CopyToReg(), the machine
    // instruction fails because a TargetExternalSymbol (not lowered) is
    // target dependent, and CopyToReg assumes the source is lowered.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(idx + 1);
    InVals.push_back(p);
  }

  // Clang will check explicit VarArg and issue an error if there is any.
  // However, Clang will let code with an implicit var arg like f() pass.
  // See bug 617733. We treat this case as if the arg list is empty.
  // if (F.isVarArg()) {
  //   assert(0 && "VarArg not supported yet!");
  // }

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));

  return Chain;
}
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  Type *RetTy = MF.getFunction().getReturnType();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  const DataLayout &DL = DAG.getDataLayout();
  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  auto VectorInfo = VectorizePTXValueVTs(
      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
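  //
  // (For instance, an i8 return value is widened to i32 before being stored
  // back; the ExtendIntegerRetVal flag below records this so the StoreRetval
  // node uses i32 as its store type.)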
  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<SDValue, 6> StoreOperands;
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    // New load/store. Record chain and offset operands.
    if (VectorInfo[i] & PVF_FIRST) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      StoreOperands.push_back(Chain);
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
    }

    SDValue RetVal = OutVals[i];
    if (ExtendIntegerRetVal) {
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                  : ISD::ZERO_EXTEND,
                           dl, MVT::i32, RetVal);
    } else if (RetVal.getValueSizeInBits() < 16) {
      // Use 16-bit registers for small load-stores as it's the
      // smallest general purpose register size supported by NVPTX.
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
    }

    // Record the value to return.
    StoreOperands.push_back(RetVal);

    // That's the last element of this store op.
    if (VectorInfo[i] & PVF_LAST) {
      NVPTXISD::NodeType Op;
      unsigned NumElts = StoreOperands.size() - 2;
      switch (NumElts) {
      case 1:
        Op = NVPTXISD::StoreRetval;
        break;
      case 2:
        Op = NVPTXISD::StoreRetvalV2;
        break;
      case 4:
        Op = NVPTXISD::StoreRetvalV4;
        break;
      default:
        llvm_unreachable("Invalid vector info.");
      }

      // Adjust the type of the load/store op if we've extended the scalar
      // return value.
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Chain = DAG.getMemIntrinsicNode(
          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
      // Cleanup vector state.
      StoreOperands.clear();
    }
  }

  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.length() > 1)
    return;

  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
  switch (Intrinsic) {
  default:
    return 0;

  case Intrinsic::nvvm_tex_1d_v4f32_s32:
    return NVPTXISD::Tex1DFloatS32;
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloat;
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloatGrad;
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
    return NVPTXISD::Tex1DS32S32;
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
    return NVPTXISD::Tex1DS32Float;
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
    return NVPTXISD::Tex1DS32FloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
    return NVPTXISD::Tex1DS32FloatGrad;
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
    return NVPTXISD::Tex1DU32S32;
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
    return NVPTXISD::Tex1DU32Float;
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
    return NVPTXISD::Tex1DU32FloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
    return NVPTXISD::Tex1DU32FloatGrad;

  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
    return NVPTXISD::Tex1DArrayFloatS32;
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloat;
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
    return NVPTXISD::Tex1DArrayS32S32;
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32Float;
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
    return NVPTXISD::Tex1DArrayU32S32;
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32Float;
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_2d_v4f32_s32:
    return NVPTXISD::Tex2DFloatS32;
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloat;
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloatLevel;
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloatGrad;
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
    return NVPTXISD::Tex2DS32S32;
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
    return NVPTXISD::Tex2DS32Float;
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
    return NVPTXISD::Tex2DS32FloatLevel;
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
    return NVPTXISD::Tex2DS32FloatGrad;
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
    return NVPTXISD::Tex2DU32S32;
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
    return NVPTXISD::Tex2DU32Float;
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
    return NVPTXISD::Tex2DU32FloatLevel;
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
    return NVPTXISD::Tex2DU32FloatGrad;

  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
    return NVPTXISD::Tex2DArrayFloatS32;
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
    return NVPTXISD::Tex2DArrayFloatFloat;
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
    return NVPTXISD::Tex2DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
    return NVPTXISD::Tex2DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
    return NVPTXISD::Tex2DArrayS32S32;
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
    return NVPTXISD::Tex2DArrayS32Float;
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
    return NVPTXISD::Tex2DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
    return NVPTXISD::Tex2DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
    return NVPTXISD::Tex2DArrayU32S32;
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
    return NVPTXISD::Tex2DArrayU32Float;
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
    return NVPTXISD::Tex2DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
    return NVPTXISD::Tex2DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_3d_v4f32_s32:
    return NVPTXISD::Tex3DFloatS32;
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
    return NVPTXISD::Tex3DFloatFloat;
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
    return NVPTXISD::Tex3DFloatFloatLevel;
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
    return NVPTXISD::Tex3DFloatFloatGrad;
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
    return NVPTXISD::Tex3DS32S32;
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
    return NVPTXISD::Tex3DS32Float;
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
    return NVPTXISD::Tex3DS32FloatLevel;
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
    return NVPTXISD::Tex3DS32FloatGrad;
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
    return NVPTXISD::Tex3DU32S32;
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
    return NVPTXISD::Tex3DU32Float;
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
    return NVPTXISD::Tex3DU32FloatLevel;
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
    return NVPTXISD::Tex3DU32FloatGrad;

  case Intrinsic::nvvm_tex_cube_v4f32_f32:
    return NVPTXISD::TexCubeFloatFloat;
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
    return NVPTXISD::TexCubeFloatFloatLevel;
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
    return NVPTXISD::TexCubeS32Float;
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
    return NVPTXISD::TexCubeS32FloatLevel;
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
    return NVPTXISD::TexCubeU32Float;
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
    return NVPTXISD::TexCubeU32FloatLevel;

  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
    return NVPTXISD::TexCubeArrayFloatFloat;
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
    return NVPTXISD::TexCubeArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
    return NVPTXISD::TexCubeArrayS32Float;
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
    return NVPTXISD::TexCubeArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
    return NVPTXISD::TexCubeArrayU32Float;
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
    return NVPTXISD::TexCubeArrayU32FloatLevel;

  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
    return NVPTXISD::Tld4R2DFloatFloat;
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
    return NVPTXISD::Tld4G2DFloatFloat;
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
    return NVPTXISD::Tld4B2DFloatFloat;
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
    return NVPTXISD::Tld4A2DFloatFloat;
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
    return NVPTXISD::Tld4R2DS64Float;
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
    return NVPTXISD::Tld4G2DS64Float;
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
    return NVPTXISD::Tld4B2DS64Float;
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
    return NVPTXISD::Tld4A2DS64Float;
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
    return NVPTXISD::Tld4R2DU64Float;
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
    return NVPTXISD::Tld4G2DU64Float;
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
    return NVPTXISD::Tld4B2DU64Float;
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
    return NVPTXISD::Tld4A2DU64Float;

  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
    return NVPTXISD::TexUnified1DFloatS32;
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloat;
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
    return NVPTXISD::TexUnified1DS32S32;
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
    return NVPTXISD::TexUnified1DS32Float;
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
    return NVPTXISD::TexUnified1DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
    return NVPTXISD::TexUnified1DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
    return NVPTXISD::TexUnified1DU32S32;
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
    return NVPTXISD::TexUnified1DU32Float;
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
    return NVPTXISD::TexUnified1DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
    return NVPTXISD::TexUnified1DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
    return NVPTXISD::TexUnified1DArrayFloatS32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
    return NVPTXISD::TexUnified1DArrayS32S32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32Float;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
    return NVPTXISD::TexUnified1DArrayU32S32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32Float;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
    return NVPTXISD::TexUnified2DFloatS32;
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloat;
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
    return NVPTXISD::TexUnified2DS32S32;
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
    return NVPTXISD::TexUnified2DS32Float;
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
    return NVPTXISD::TexUnified2DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
    return NVPTXISD::TexUnified2DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
    return NVPTXISD::TexUnified2DU32S32;
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
    return NVPTXISD::TexUnified2DU32Float;
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
    return NVPTXISD::TexUnified2DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
    return NVPTXISD::TexUnified2DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
    return NVPTXISD::TexUnified2DArrayFloatS32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
    return NVPTXISD::TexUnified2DArrayS32S32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32Float;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
    return NVPTXISD::TexUnified2DArrayU32S32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32Float;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
    return NVPTXISD::TexUnified3DFloatS32;
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloat;
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
    return NVPTXISD::TexUnified3DS32S32;
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
    return NVPTXISD::TexUnified3DS32Float;
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
    return NVPTXISD::TexUnified3DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
    return NVPTXISD::TexUnified3DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
    return NVPTXISD::TexUnified3DU32S32;
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
    return NVPTXISD::TexUnified3DU32Float;
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
    return NVPTXISD::TexUnified3DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
    return NVPTXISD::TexUnified3DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeFloatFloat;
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeS32Float;
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeU32Float;
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeU32FloatLevel;

  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeArrayS32Float;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeArrayU32Float;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;

  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedR2DS64Float;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedG2DS64Float;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedB2DS64Float;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedA2DS64Float;
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedR2DU64Float;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedG2DU64Float;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedB2DU64Float;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedA2DU64Float;
  }
}
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
  switch (Intrinsic) {
  default:
    return 0;

  case Intrinsic::nvvm_suld_1d_i8_clamp:
    return NVPTXISD::Suld1DI8Clamp;
  case Intrinsic::nvvm_suld_1d_i16_clamp:
    return NVPTXISD::Suld1DI16Clamp;
  case Intrinsic::nvvm_suld_1d_i32_clamp:
    return NVPTXISD::Suld1DI32Clamp;
  case Intrinsic::nvvm_suld_1d_i64_clamp:
    return NVPTXISD::Suld1DI64Clamp;
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
    return NVPTXISD::Suld1DV2I8Clamp;
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
    return NVPTXISD::Suld1DV2I16Clamp;
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
    return NVPTXISD::Suld1DV2I32Clamp;
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
    return NVPTXISD::Suld1DV2I64Clamp;
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
    return NVPTXISD::Suld1DV4I8Clamp;
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
    return NVPTXISD::Suld1DV4I16Clamp;
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
    return NVPTXISD::Suld1DV4I32Clamp;
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
    return NVPTXISD::Suld1DArrayI8Clamp;
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
    return NVPTXISD::Suld1DArrayI16Clamp;
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
    return NVPTXISD::Suld1DArrayI32Clamp;
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
    return NVPTXISD::Suld1DArrayI64Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
    return NVPTXISD::Suld1DArrayV2I8Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
    return NVPTXISD::Suld1DArrayV2I16Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
    return NVPTXISD::Suld1DArrayV2I32Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
    return NVPTXISD::Suld1DArrayV2I64Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
    return NVPTXISD::Suld1DArrayV4I8Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
    return NVPTXISD::Suld1DArrayV4I16Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
    return NVPTXISD::Suld1DArrayV4I32Clamp;
  case Intrinsic::nvvm_suld_2d_i8_clamp:
    return NVPTXISD::Suld2DI8Clamp;
  case Intrinsic::nvvm_suld_2d_i16_clamp:
    return NVPTXISD::Suld2DI16Clamp;
  case Intrinsic::nvvm_suld_2d_i32_clamp:
    return NVPTXISD::Suld2DI32Clamp;
  case Intrinsic::nvvm_suld_2d_i64_clamp:
    return NVPTXISD::Suld2DI64Clamp;
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
    return NVPTXISD::Suld2DV2I8Clamp;
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
    return NVPTXISD::Suld2DV2I16Clamp;
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
    return NVPTXISD::Suld2DV2I32Clamp;
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
    return NVPTXISD::Suld2DV2I64Clamp;
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
    return NVPTXISD::Suld2DV4I8Clamp;
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
    return NVPTXISD::Suld2DV4I16Clamp;
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
    return NVPTXISD::Suld2DV4I32Clamp;
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
    return NVPTXISD::Suld2DArrayI8Clamp;
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
    return NVPTXISD::Suld2DArrayI16Clamp;
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
    return NVPTXISD::Suld2DArrayI32Clamp;
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
    return NVPTXISD::Suld2DArrayI64Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
    return NVPTXISD::Suld2DArrayV2I8Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
    return NVPTXISD::Suld2DArrayV2I16Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
    return NVPTXISD::Suld2DArrayV2I32Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
    return NVPTXISD::Suld2DArrayV2I64Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
    return NVPTXISD::Suld2DArrayV4I8Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
    return NVPTXISD::Suld2DArrayV4I16Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
    return NVPTXISD::Suld2DArrayV4I32Clamp;
  case Intrinsic::nvvm_suld_3d_i8_clamp:
    return NVPTXISD::Suld3DI8Clamp;
  case Intrinsic::nvvm_suld_3d_i16_clamp:
    return NVPTXISD::Suld3DI16Clamp;
  case Intrinsic::nvvm_suld_3d_i32_clamp:
    return NVPTXISD::Suld3DI32Clamp;
  case Intrinsic::nvvm_suld_3d_i64_clamp:
    return NVPTXISD::Suld3DI64Clamp;
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
    return NVPTXISD::Suld3DV2I8Clamp;
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
    return NVPTXISD::Suld3DV2I16Clamp;
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
    return NVPTXISD::Suld3DV2I32Clamp;
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
    return NVPTXISD::Suld3DV2I64Clamp;
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
    return NVPTXISD::Suld3DV4I8Clamp;
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
    return NVPTXISD::Suld3DV4I16Clamp;
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
    return NVPTXISD::Suld3DV4I32Clamp;

  case Intrinsic::nvvm_suld_1d_i8_trap:
    return NVPTXISD::Suld1DI8Trap;
  case Intrinsic::nvvm_suld_1d_i16_trap:
    return NVPTXISD::Suld1DI16Trap;
  case Intrinsic::nvvm_suld_1d_i32_trap:
    return NVPTXISD::Suld1DI32Trap;
  case Intrinsic::nvvm_suld_1d_i64_trap:
    return NVPTXISD::Suld1DI64Trap;
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
    return NVPTXISD::Suld1DV2I8Trap;
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
    return NVPTXISD::Suld1DV2I16Trap;
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
    return NVPTXISD::Suld1DV2I32Trap;
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
    return NVPTXISD::Suld1DV2I64Trap;
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
    return NVPTXISD::Suld1DV4I8Trap;
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
    return NVPTXISD::Suld1DV4I16Trap;
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
    return NVPTXISD::Suld1DV4I32Trap;
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
    return NVPTXISD::Suld1DArrayI8Trap;
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
    return NVPTXISD::Suld1DArrayI16Trap;
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
    return NVPTXISD::Suld1DArrayI32Trap;
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
    return NVPTXISD::Suld1DArrayI64Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
    return NVPTXISD::Suld1DArrayV2I8Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
    return NVPTXISD::Suld1DArrayV2I16Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
    return NVPTXISD::Suld1DArrayV2I32Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
    return NVPTXISD::Suld1DArrayV2I64Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
    return NVPTXISD::Suld1DArrayV4I8Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
    return NVPTXISD::Suld1DArrayV4I16Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
    return NVPTXISD::Suld1DArrayV4I32Trap;
  case Intrinsic::nvvm_suld_2d_i8_trap:
    return NVPTXISD::Suld2DI8Trap;
  case Intrinsic::nvvm_suld_2d_i16_trap:
    return NVPTXISD::Suld2DI16Trap;
  case Intrinsic::nvvm_suld_2d_i32_trap:
    return NVPTXISD::Suld2DI32Trap;
  case Intrinsic::nvvm_suld_2d_i64_trap:
    return NVPTXISD::Suld2DI64Trap;
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
    return NVPTXISD::Suld2DV2I8Trap;
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
    return NVPTXISD::Suld2DV2I16Trap;
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
    return NVPTXISD::Suld2DV2I32Trap;
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
    return NVPTXISD::Suld2DV2I64Trap;
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
    return NVPTXISD::Suld2DV4I8Trap;
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
    return NVPTXISD::Suld2DV4I16Trap;
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
    return NVPTXISD::Suld2DV4I32Trap;
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
    return NVPTXISD::Suld2DArrayI8Trap;
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
    return NVPTXISD::Suld2DArrayI16Trap;
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
    return NVPTXISD::Suld2DArrayI32Trap;
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
    return NVPTXISD::Suld2DArrayI64Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
    return NVPTXISD::Suld2DArrayV2I8Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
    return NVPTXISD::Suld2DArrayV2I16Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
    return NVPTXISD::Suld2DArrayV2I32Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
    return NVPTXISD::Suld2DArrayV2I64Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
    return NVPTXISD::Suld2DArrayV4I8Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
    return NVPTXISD::Suld2DArrayV4I16Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
    return NVPTXISD::Suld2DArrayV4I32Trap;
  case Intrinsic::nvvm_suld_3d_i8_trap:
    return NVPTXISD::Suld3DI8Trap;
  case Intrinsic::nvvm_suld_3d_i16_trap:
    return NVPTXISD::Suld3DI16Trap;
  case Intrinsic::nvvm_suld_3d_i32_trap:
    return NVPTXISD::Suld3DI32Trap;
  case Intrinsic::nvvm_suld_3d_i64_trap:
    return NVPTXISD::Suld3DI64Trap;
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
    return NVPTXISD::Suld3DV2I8Trap;
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
    return NVPTXISD::Suld3DV2I16Trap;
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
    return NVPTXISD::Suld3DV2I32Trap;
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
    return NVPTXISD::Suld3DV2I64Trap;
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
    return NVPTXISD::Suld3DV4I8Trap;
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
    return NVPTXISD::Suld3DV4I16Trap;
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
    return NVPTXISD::Suld3DV4I32Trap;

  case Intrinsic::nvvm_suld_1d_i8_zero:
    return NVPTXISD::Suld1DI8Zero;
  case Intrinsic::nvvm_suld_1d_i16_zero:
    return NVPTXISD::Suld1DI16Zero;
  case Intrinsic::nvvm_suld_1d_i32_zero:
    return NVPTXISD::Suld1DI32Zero;
  case Intrinsic::nvvm_suld_1d_i64_zero:
    return NVPTXISD::Suld1DI64Zero;
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
    return NVPTXISD::Suld1DV2I8Zero;
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
    return NVPTXISD::Suld1DV2I16Zero;
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
    return NVPTXISD::Suld1DV2I32Zero;
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
    return NVPTXISD::Suld1DV2I64Zero;
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
    return NVPTXISD::Suld1DV4I8Zero;
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
    return NVPTXISD::Suld1DV4I16Zero;
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
    return NVPTXISD::Suld1DV4I32Zero;
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
    return NVPTXISD::Suld1DArrayI8Zero;
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
    return NVPTXISD::Suld1DArrayI16Zero;
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
    return NVPTXISD::Suld1DArrayI32Zero;
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
    return NVPTXISD::Suld1DArrayI64Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
    return NVPTXISD::Suld1DArrayV2I8Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
    return NVPTXISD::Suld1DArrayV2I16Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
    return NVPTXISD::Suld1DArrayV2I32Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
    return NVPTXISD::Suld1DArrayV2I64Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
    return NVPTXISD::Suld1DArrayV4I8Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
    return NVPTXISD::Suld1DArrayV4I16Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
    return NVPTXISD::Suld1DArrayV4I32Zero;
  case Intrinsic::nvvm_suld_2d_i8_zero:
    return NVPTXISD::Suld2DI8Zero;
  case Intrinsic::nvvm_suld_2d_i16_zero:
    return NVPTXISD::Suld2DI16Zero;
  case Intrinsic::nvvm_suld_2d_i32_zero:
    return NVPTXISD::Suld2DI32Zero;
  case Intrinsic::nvvm_suld_2d_i64_zero:
    return NVPTXISD::Suld2DI64Zero;
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
    return NVPTXISD::Suld2DV2I8Zero;
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
    return NVPTXISD::Suld2DV2I16Zero;
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
    return NVPTXISD::Suld2DV2I32Zero;
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
    return NVPTXISD::Suld2DV2I64Zero;
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
    return NVPTXISD::Suld2DV4I8Zero;
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
    return NVPTXISD::Suld2DV4I16Zero;
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
    return NVPTXISD::Suld2DV4I32Zero;
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
    return NVPTXISD::Suld2DArrayI8Zero;
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
    return NVPTXISD::Suld2DArrayI16Zero;
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
    return NVPTXISD::Suld2DArrayI32Zero;
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
    return NVPTXISD::Suld2DArrayI64Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
    return NVPTXISD::Suld2DArrayV2I8Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
    return NVPTXISD::Suld2DArrayV2I16Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
    return NVPTXISD::Suld2DArrayV2I32Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
    return NVPTXISD::Suld2DArrayV2I64Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
    return NVPTXISD::Suld2DArrayV4I8Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
    return NVPTXISD::Suld2DArrayV4I16Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
    return NVPTXISD::Suld2DArrayV4I32Zero;
  case Intrinsic::nvvm_suld_3d_i8_zero:
    return NVPTXISD::Suld3DI8Zero;
  case Intrinsic::nvvm_suld_3d_i16_zero:
    return NVPTXISD::Suld3DI16Zero;
  case Intrinsic::nvvm_suld_3d_i32_zero:
    return NVPTXISD::Suld3DI32Zero;
  case Intrinsic::nvvm_suld_3d_i64_zero:
    return NVPTXISD::Suld3DI64Zero;
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
    return NVPTXISD::Suld3DV2I8Zero;
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
    return NVPTXISD::Suld3DV2I16Zero;
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
    return NVPTXISD::Suld3DV2I32Zero;
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    return NVPTXISD::Suld3DV2I64Zero;
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    return NVPTXISD::Suld3DV4I8Zero;
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    return NVPTXISD::Suld3DV4I16Zero;
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    return NVPTXISD::Suld3DV4I32Zero;
  }
}

// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need information that is only available in the
// "Value" type of the destination pointer; in particular, its address space.
bool NVPTXTargetLowering::getTgtMemIntrinsic(
    IntrinsicInfo &Info, const CallInst &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  switch (Intrinsic) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
    // in order to model data exchange with other threads, but perform no real
    // memory accesses.
    Info.memVT = MVT::i1;
    // Our result depends on both our and other threads' arguments.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(4);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(8);
    return true;
  }
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
    auto &DL = I.getModule()->getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    Info.align.reset();
    return true;
  }
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    auto &DL = I.getModule()->getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
      Info.memVT = getPointerTy(DL);
    else
      Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
    return true;
  }
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p: {
    auto &DL = I.getModule()->getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
      Info.memVT = getPointerTy(DL);
    else
      Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
    return true;
  }
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  case Intrinsic::nvvm_suld_1d_i8_clamp:
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_i8_clamp:
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_3d_i8_clamp:
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_i8_trap:
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_i8_trap:
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_3d_i8_trap:
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_i8_zero:
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_i8_zero:
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_3d_i8_zero:
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  case Intrinsic::nvvm_suld_1d_i16_clamp:
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_i16_clamp:
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_3d_i16_clamp:
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_i16_trap:
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_i16_trap:
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_3d_i16_trap:
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_i16_zero:
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_i16_zero:
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_3d_i16_zero:
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  case Intrinsic::nvvm_suld_1d_i32_clamp:
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_i32_clamp:
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_3d_i32_clamp:
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_i32_trap:
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_3d_i32_trap:
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_i32_zero:
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_i32_zero:
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_3d_i32_zero:
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_i64_trap:
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_3d_i64_trap:
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_i64_zero:
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_i64_zero:
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_3d_i64_zero:
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  return false;
}
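
// Illustrative note (a sketch, not from the original file): for a call to one
// of the v8f16 wmma load intrinsics handled above, the IntrinsicInfo filled in
// by this hook makes instruction selection attach a MachineMemOperand that
// describes a 16-byte-aligned v8f16 load from the pointer argument, so alias
// analysis and the scheduler can reason about the access even though the wmma
// load is an intrinsic rather than an ordinary LoadSDNode.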

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp).
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]
  if (AM.BaseGV)
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed.
    return false;
  }
  return true;
}
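
// Illustrative legality decisions under the rules above (a sketch; %r* and
// @gv are hypothetical operands):
//   [%r]           Scale = 0, HasBaseReg            -> legal   ("areg")
//   [%r + 16]      Scale = 0, HasBaseReg, BaseOffs  -> legal   ("areg+immoff")
//   [@gv]          BaseGV only                      -> legal   ("avar")
//   [%r1 + %r2]    Scale = 1, HasBaseReg            -> rejected (r+r)
//   [%r1 + 4*%r2]  Scale = 4                        -> rejected (scaled reg)
//   [@gv + 8]      BaseGV with BaseOffs             -> rejected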

//===----------------------------------------------------------------------===//
//                         NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
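
// Illustrative use from device source code (a sketch; Out and In are
// hypothetical variables): the 'r' constraint below maps to
// Int32RegsRegClass via the function above, and '=r' marks the output.
//   int Out, In = 42;
//   asm("mov.u32 %0, %1;" : "=r"(Out) : "r"(In));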

//===----------------------------------------------------------------------===//
//                         NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOpt::Level OptLevel) const {
  // Always honor the command-line argument.
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == 0)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if the "unsafe-fp-math" attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}
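
// Illustrative IR that takes the attribute path above (a sketch; @foo is a
// hypothetical function):
//   define float @foo(float %x) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }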

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const NVPTXSubtarget &Subtarget,
                              CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip the non-integer, non-scalar case.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding.
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds. The heuristic is that if a use is not an add, that
      // use cannot be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for potential register pressure: the difference
        // in IR order approximates the distance between the def and this
        // use, and the longer that distance, the more likely the fusion is
        // to cause register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOpt::Level OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  if (SDValue Result =
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
}
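
// Illustrative rewrite performed by the combine above (a sketch): with i32
// operands and a single-use mul,
//   (add (mul %a, %b), %c)  ->  (NVPTXISD::IMAD %a, %b, %c)
// which should select to PTX's mad.lo instruction. Likewise, a one-use FMUL
// fed into an FADD becomes an ISD::FMA when contraction is allowed.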

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to
  // i16 registers, optionally ANY_EXTENDs it (if the target type is an
  // integer), and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val))
    std::swap(Val, Mask);

  SDValue AExt;
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr)
    Val = Val->getOperand(0);

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant.
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off the top 8 bits.
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case.
      return SDValue();
    }

    unsigned ExtType =
        cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands() - 1))
            ->getZExtValue();
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to
      // zero out the high 8 bits.
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load.
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOpt::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}
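
// Illustrative input where the combine above fires (a sketch): a division of
// the same operands already exists, so the remainder is rewritten in terms of
// it and the expensive division is computed only once:
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d   ; -> %r = sub i32 %n, (mul i32 %q, %d)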

enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op.
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS.
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant.
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned)
      return Val.isIntN(OptSize);
    return Val.isSignedIntN(OptSize);
  }

  OperandSignedness RHSSign;
  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
    return false;
  return LHSSign == RHSSign;
}
  4334. /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
  4335. /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
  4336. /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
  4337. /// amount.
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
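    // shl %x, c is equivalent to mul %x, (1 << c); only rewrite in-range
    // shift amounts, since an out-of-range shl is undefined anyway.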
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
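  // Only vector compares of f16x2 operands producing a v2i1 result are
  // interesting here; everything else is handled generically.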
  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}
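
/// PerformDAGCombine - Dispatch the target-specific DAG combines above based
/// on the node's opcode.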
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16: // <4 x f16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool LoadF16x2 = false;
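  // Pick the multi-result load opcode and its result types from the element
  // count.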
  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
    LoadF16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
                     MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (LoadF16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}
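
/// ReplaceINTRINSIC_W_CHAIN - Custom-legalize the results of ldg/ldu
/// intrinsics whose types the DAG cannot handle directly: vector results are
/// split into multi-output target loads, and i8 results are widened to i16.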
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands
      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
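
// NVPTX does not distinguish object-file sections; every global is assigned
// to the single data section.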
MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}