123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139 |
- //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This file defines the interfaces that NVPTX uses to lower LLVM code into a
- // selection DAG.
- //
- //===----------------------------------------------------------------------===//
- #include "NVPTXISelLowering.h"
- #include "MCTargetDesc/NVPTXBaseInfo.h"
- #include "NVPTX.h"
- #include "NVPTXSubtarget.h"
- #include "NVPTXTargetMachine.h"
- #include "NVPTXTargetObjectFile.h"
- #include "NVPTXUtilities.h"
- #include "llvm/ADT/APInt.h"
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/CodeGen/Analysis.h"
- #include "llvm/CodeGen/MachineFunction.h"
- #include "llvm/CodeGen/MachineMemOperand.h"
- #include "llvm/CodeGen/SelectionDAG.h"
- #include "llvm/CodeGen/SelectionDAGNodes.h"
- #include "llvm/CodeGen/TargetCallingConv.h"
- #include "llvm/CodeGen/TargetLowering.h"
- #include "llvm/CodeGen/ValueTypes.h"
- #include "llvm/IR/Argument.h"
- #include "llvm/IR/Attributes.h"
- #include "llvm/IR/Constants.h"
- #include "llvm/IR/DataLayout.h"
- #include "llvm/IR/DerivedTypes.h"
- #include "llvm/IR/Function.h"
- #include "llvm/IR/GlobalValue.h"
- #include "llvm/IR/Instruction.h"
- #include "llvm/IR/Instructions.h"
- #include "llvm/IR/IntrinsicsNVPTX.h"
- #include "llvm/IR/Module.h"
- #include "llvm/IR/Type.h"
- #include "llvm/IR/Value.h"
- #include "llvm/Support/Casting.h"
- #include "llvm/Support/CodeGen.h"
- #include "llvm/Support/CommandLine.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Support/MachineValueType.h"
- #include "llvm/Support/MathExtras.h"
- #include "llvm/Support/raw_ostream.h"
- #include "llvm/Target/TargetMachine.h"
- #include "llvm/Target/TargetOptions.h"
- #include <algorithm>
- #include <cassert>
- #include <cstdint>
- #include <iterator>
- #include <sstream>
- #include <string>
- #include <utility>
- #include <vector>
- #define DEBUG_TYPE "nvptx-lower"
- using namespace llvm;
- static std::atomic<unsigned> GlobalUniqueCallSite;
- static cl::opt<bool> sched4reg(
- "nvptx-sched4reg",
- cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
- static cl::opt<unsigned>
- FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::init(2));
- static cl::opt<int> UsePrecDivF32(
- "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
- " IEEE Compliant F32 div.rnd if available."),
- cl::init(2));
- static cl::opt<bool> UsePrecSqrtF32(
- "nvptx-prec-sqrtf32", cl::Hidden,
- cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
- cl::init(true));
- int NVPTXTargetLowering::getDivF32Level() const {
- if (UsePrecDivF32.getNumOccurrences() > 0) {
- // If nvptx-prec-div32=N is used on the command-line, always honor it
- return UsePrecDivF32;
- } else {
- // Otherwise, use div.approx if fast math is enabled
- if (getTargetMachine().Options.UnsafeFPMath)
- return 0;
- else
- return 2;
- }
- }
- bool NVPTXTargetLowering::usePrecSqrtF32() const {
- if (UsePrecSqrtF32.getNumOccurrences() > 0) {
- // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
- return UsePrecSqrtF32;
- } else {
- // Otherwise, use sqrt.approx if fast math is enabled
- return !getTargetMachine().Options.UnsafeFPMath;
- }
- }
- bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
- return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
- DenormalMode::PreserveSign;
- }
- static bool IsPTXVectorType(MVT VT) {
- switch (VT.SimpleTy) {
- default:
- return false;
- case MVT::v2i1:
- case MVT::v4i1:
- case MVT::v2i8:
- case MVT::v4i8:
- case MVT::v2i16:
- case MVT::v4i16:
- case MVT::v2i32:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v4f16:
- case MVT::v8f16: // <4 x f16x2>
- case MVT::v2f32:
- case MVT::v4f32:
- case MVT::v2f64:
- return true;
- }
- }
- /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
- /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
- /// into their primitive components.
- /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
- /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
- /// LowerCall, and LowerReturn.
- static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<uint64_t> *Offsets = nullptr,
- uint64_t StartingOffset = 0) {
- SmallVector<EVT, 16> TempVTs;
- SmallVector<uint64_t, 16> TempOffsets;
- // Special case for i128 - decompose to (i64, i64)
- if (Ty->isIntegerTy(128)) {
- ValueVTs.push_back(EVT(MVT::i64));
- ValueVTs.push_back(EVT(MVT::i64));
- if (Offsets) {
- Offsets->push_back(StartingOffset + 0);
- Offsets->push_back(StartingOffset + 8);
- }
- return;
- }
- // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- auto const *SL = DL.getStructLayout(STy);
- auto ElementNum = 0;
- for(auto *EI : STy->elements()) {
- ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
- StartingOffset + SL->getElementOffset(ElementNum));
- ++ElementNum;
- }
- return;
- }
- ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
- for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
- EVT VT = TempVTs[i];
- uint64_t Off = TempOffsets[i];
- // Split vectors into individual elements, except for v2f16, which
- // we will pass as a single scalar.
- if (VT.isVector()) {
- unsigned NumElts = VT.getVectorNumElements();
- EVT EltVT = VT.getVectorElementType();
- // Vectors with an even number of f16 elements will be passed to
- // us as an array of v2f16 elements. We must match this so we
- // stay in sync with Ins/Outs.
- if (EltVT == MVT::f16 && NumElts % 2 == 0) {
- EltVT = MVT::v2f16;
- NumElts /= 2;
- }
- for (unsigned j = 0; j != NumElts; ++j) {
- ValueVTs.push_back(EltVT);
- if (Offsets)
- Offsets->push_back(Off + j * EltVT.getStoreSize());
- }
- } else {
- ValueVTs.push_back(VT);
- if (Offsets)
- Offsets->push_back(Off);
- }
- }
- }
- // Check whether we can merge loads/stores of some of the pieces of a
- // flattened function parameter or return value into a single vector
- // load/store.
- //
- // The flattened parameter is represented as a list of EVTs and
- // offsets, and the whole structure is aligned to ParamAlignment. This
- // function determines whether we can load/store pieces of the
- // parameter starting at index Idx using a single vectorized op of
- // size AccessSize. If so, it returns the number of param pieces
- // covered by the vector op. Otherwise, it returns 1.
- static unsigned CanMergeParamLoadStoresStartingAt(
- unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
- // Can't vectorize if param alignment is not sufficient.
- if (ParamAlignment < AccessSize)
- return 1;
- // Can't vectorize if offset is not aligned.
- if (Offsets[Idx] & (AccessSize - 1))
- return 1;
- EVT EltVT = ValueVTs[Idx];
- unsigned EltSize = EltVT.getStoreSize();
- // Element is too large to vectorize.
- if (EltSize >= AccessSize)
- return 1;
- unsigned NumElts = AccessSize / EltSize;
- // Can't vectorize if AccessBytes if not a multiple of EltSize.
- if (AccessSize != EltSize * NumElts)
- return 1;
- // We don't have enough elements to vectorize.
- if (Idx + NumElts > ValueVTs.size())
- return 1;
- // PTX ISA can only deal with 2- and 4-element vector ops.
- if (NumElts != 4 && NumElts != 2)
- return 1;
- for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
- // Types do not match.
- if (ValueVTs[j] != EltVT)
- return 1;
- // Elements are not contiguous.
- if (Offsets[j] - Offsets[j - 1] != EltSize)
- return 1;
- }
- // OK. We can vectorize ValueVTs[i..i+NumElts)
- return NumElts;
- }
- // Flags for tracking per-element vectorization state of loads/stores
- // of a flattened function parameter or return value.
- enum ParamVectorizationFlags {
- PVF_INNER = 0x0, // Middle elements of a vector.
- PVF_FIRST = 0x1, // First element of the vector.
- PVF_LAST = 0x2, // Last element of the vector.
- // Scalar is effectively a 1-element vector.
- PVF_SCALAR = PVF_FIRST | PVF_LAST
- };
- // Computes whether and how we can vectorize the loads/stores of a
- // flattened function parameter or return value.
- //
- // The flattened parameter is represented as the list of ValueVTs and
- // Offsets, and is aligned to ParamAlignment bytes. We return a vector
- // of the same size as ValueVTs indicating how each piece should be
- // loaded/stored (i.e. as a scalar, or as part of a vector
- // load/store).
- static SmallVector<ParamVectorizationFlags, 16>
- VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets,
- Align ParamAlignment) {
- // Set vector size to match ValueVTs and mark all elements as
- // scalars by default.
- SmallVector<ParamVectorizationFlags, 16> VectorInfo;
- VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
- // Check what we can vectorize using 128/64/32-bit accesses.
- for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
- // Skip elements we've already processed.
- assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
- for (unsigned AccessSize : {16, 8, 4, 2}) {
- unsigned NumElts = CanMergeParamLoadStoresStartingAt(
- I, AccessSize, ValueVTs, Offsets, ParamAlignment);
- // Mark vectorized elements.
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected return value");
- case 1:
- // Can't vectorize using this size, try next smaller size.
- continue;
- case 2:
- assert(I + 1 < E && "Not enough elements.");
- VectorInfo[I] = PVF_FIRST;
- VectorInfo[I + 1] = PVF_LAST;
- I += 1;
- break;
- case 4:
- assert(I + 3 < E && "Not enough elements.");
- VectorInfo[I] = PVF_FIRST;
- VectorInfo[I + 1] = PVF_INNER;
- VectorInfo[I + 2] = PVF_INNER;
- VectorInfo[I + 3] = PVF_LAST;
- I += 3;
- break;
- }
- // Break out of the inner loop because we've already succeeded
- // using largest possible AccessSize.
- break;
- }
- }
- return VectorInfo;
- }
- // NVPTXTargetLowering Constructor.
- NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
- const NVPTXSubtarget &STI)
- : TargetLowering(TM), nvTM(&TM), STI(STI) {
- // always lower memset, memcpy, and memmove intrinsics to load/store
- // instructions, rather
- // then generating calls to memset, mempcy or memmove.
- MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
- setBooleanContents(ZeroOrNegativeOneBooleanContent);
- setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- // Jump is Expensive. Don't create extra control flow for 'and', 'or'
- // condition branches.
- setJumpIsExpensive(true);
- // Wide divides are _very_ slow. Try to reduce the width of the divide if
- // possible.
- addBypassSlowDiv(64, 32);
- // By default, use the Source scheduling
- if (sched4reg)
- setSchedulingPreference(Sched::RegPressure);
- else
- setSchedulingPreference(Sched::Source);
- auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
- LegalizeAction NoF16Action) {
- setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
- };
- addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
- addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
- addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
- addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
- addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
- addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
- addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
- addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
- // Conversion to/from FP16/FP16x2 is always legal.
- setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
- setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
- setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
- // Operations not directly supported by NVPTX.
- for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
- MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::SELECT_CC, VT, Expand);
- setOperationAction(ISD::BR_CC, VT, Expand);
- }
- // Some SIGN_EXTEND_INREG can be done using cvt instruction.
- // For others we will expand to a SHL/SRA pair.
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
- // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
- // that don't have h/w rotation we lower them to multi-instruction assembly.
- // See ROT*_sw in NVPTXIntrInfo.td
- setOperationAction(ISD::ROTL, MVT::i64, Legal);
- setOperationAction(ISD::ROTR, MVT::i64, Legal);
- setOperationAction(ISD::ROTL, MVT::i32, Legal);
- setOperationAction(ISD::ROTR, MVT::i32, Legal);
- setOperationAction(ISD::ROTL, MVT::i16, Expand);
- setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::ROTL, MVT::i8, Expand);
- setOperationAction(ISD::ROTR, MVT::i8, Expand);
- setOperationAction(ISD::BSWAP, MVT::i16, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- setOperationAction(ISD::BSWAP, MVT::i64, Expand);
- // Indirect branch is not supported.
- // This also disables Jump Table creation.
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- // We want to legalize constant related memmove and memcopy
- // intrinsics.
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setTruncStoreAction(VT, MVT::i1, Expand);
- }
- // This is legal in NVPTX
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
- // TRAP can be lowered to PTX trap
- setOperationAction(ISD::TRAP, MVT::Other, Legal);
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
- if (IsPTXVectorType(VT)) {
- setOperationAction(ISD::LOAD, VT, Custom);
- setOperationAction(ISD::STORE, VT, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
- }
- }
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
- for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::ABS, Ty, Legal);
- setOperationAction(ISD::SMIN, Ty, Legal);
- setOperationAction(ISD::SMAX, Ty, Legal);
- setOperationAction(ISD::UMIN, Ty, Legal);
- setOperationAction(ISD::UMAX, Ty, Legal);
- setOperationAction(ISD::CTPOP, Ty, Legal);
- setOperationAction(ISD::CTLZ, Ty, Legal);
- }
- setOperationAction(ISD::CTTZ, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- // PTX does not directly support SELP of i1, so promote to i32 first
- setOperationAction(ISD::SELECT, MVT::i1, Custom);
- // PTX cannot multiply two i64s in a single instruction.
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
- // We have some custom DAG combine patterns for these nodes
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SREM);
- setTargetDAGCombine(ISD::UREM);
- // setcc for f16x2 needs special handling to prevent legalizer's
- // attempt to scalarize it due to v2i1 not being legal.
- if (STI.allowFP16Math())
- setTargetDAGCombine(ISD::SETCC);
- // Promote fp16 arithmetic if fp16 hardware isn't available or the
- // user passed --nvptx-no-fp16-math. The flag is useful because,
- // although sm_53+ GPUs have some sort of FP16 support in
- // hardware, only sm_53 and sm_60 have full implementation. Others
- // only have token amount of hardware and are likely to run faster
- // by using fp32 units instead.
- for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
- setFP16OperationAction(Op, MVT::f16, Legal, Promote);
- setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
- }
- // There's no neg.f16 instruction. Expand to (0-x).
- setOperationAction(ISD::FNEG, MVT::f16, Expand);
- setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
- // (would be) Library functions.
- // These map to conversion instructions for scalar FP types.
- for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
- ISD::FTRUNC}) {
- setOperationAction(Op, MVT::f16, Legal);
- setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
- setOperationAction(Op, MVT::v2f16, Expand);
- }
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
- setOperationAction(ISD::FROUND, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::f64, Custom);
- // 'Expand' implements FCOPYSIGN without calling an external library.
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- // These map to corresponding instructions for f32/f64. f16 must be
- // promoted to f32. v2f16 is expanded to f16, which is then promoted
- // to f32.
- for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
- setOperationAction(Op, MVT::f16, Promote);
- setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
- setOperationAction(Op, MVT::v2f16, Expand);
- }
- // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
- auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
- bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
- return IsAtLeastSm80 ? Legal : NotSm80Action;
- };
- for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
- setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
- setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
- setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
- }
- for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
- setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
- setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
- setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
- }
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
- // No FPOW or FREM in PTX.
- // Now deduce the information based on the above mentioned
- // actions
- computeRegisterProperties(STI.getRegisterInfo());
- }
- const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((NVPTXISD::NodeType)Opcode) {
- case NVPTXISD::FIRST_NUMBER:
- break;
- case NVPTXISD::CALL:
- return "NVPTXISD::CALL";
- case NVPTXISD::RET_FLAG:
- return "NVPTXISD::RET_FLAG";
- case NVPTXISD::LOAD_PARAM:
- return "NVPTXISD::LOAD_PARAM";
- case NVPTXISD::Wrapper:
- return "NVPTXISD::Wrapper";
- case NVPTXISD::DeclareParam:
- return "NVPTXISD::DeclareParam";
- case NVPTXISD::DeclareScalarParam:
- return "NVPTXISD::DeclareScalarParam";
- case NVPTXISD::DeclareRet:
- return "NVPTXISD::DeclareRet";
- case NVPTXISD::DeclareScalarRet:
- return "NVPTXISD::DeclareScalarRet";
- case NVPTXISD::DeclareRetParam:
- return "NVPTXISD::DeclareRetParam";
- case NVPTXISD::PrintCall:
- return "NVPTXISD::PrintCall";
- case NVPTXISD::PrintConvergentCall:
- return "NVPTXISD::PrintConvergentCall";
- case NVPTXISD::PrintCallUni:
- return "NVPTXISD::PrintCallUni";
- case NVPTXISD::PrintConvergentCallUni:
- return "NVPTXISD::PrintConvergentCallUni";
- case NVPTXISD::LoadParam:
- return "NVPTXISD::LoadParam";
- case NVPTXISD::LoadParamV2:
- return "NVPTXISD::LoadParamV2";
- case NVPTXISD::LoadParamV4:
- return "NVPTXISD::LoadParamV4";
- case NVPTXISD::StoreParam:
- return "NVPTXISD::StoreParam";
- case NVPTXISD::StoreParamV2:
- return "NVPTXISD::StoreParamV2";
- case NVPTXISD::StoreParamV4:
- return "NVPTXISD::StoreParamV4";
- case NVPTXISD::StoreParamS32:
- return "NVPTXISD::StoreParamS32";
- case NVPTXISD::StoreParamU32:
- return "NVPTXISD::StoreParamU32";
- case NVPTXISD::CallArgBegin:
- return "NVPTXISD::CallArgBegin";
- case NVPTXISD::CallArg:
- return "NVPTXISD::CallArg";
- case NVPTXISD::LastCallArg:
- return "NVPTXISD::LastCallArg";
- case NVPTXISD::CallArgEnd:
- return "NVPTXISD::CallArgEnd";
- case NVPTXISD::CallVoid:
- return "NVPTXISD::CallVoid";
- case NVPTXISD::CallVal:
- return "NVPTXISD::CallVal";
- case NVPTXISD::CallSymbol:
- return "NVPTXISD::CallSymbol";
- case NVPTXISD::Prototype:
- return "NVPTXISD::Prototype";
- case NVPTXISD::MoveParam:
- return "NVPTXISD::MoveParam";
- case NVPTXISD::StoreRetval:
- return "NVPTXISD::StoreRetval";
- case NVPTXISD::StoreRetvalV2:
- return "NVPTXISD::StoreRetvalV2";
- case NVPTXISD::StoreRetvalV4:
- return "NVPTXISD::StoreRetvalV4";
- case NVPTXISD::PseudoUseParam:
- return "NVPTXISD::PseudoUseParam";
- case NVPTXISD::RETURN:
- return "NVPTXISD::RETURN";
- case NVPTXISD::CallSeqBegin:
- return "NVPTXISD::CallSeqBegin";
- case NVPTXISD::CallSeqEnd:
- return "NVPTXISD::CallSeqEnd";
- case NVPTXISD::CallPrototype:
- return "NVPTXISD::CallPrototype";
- case NVPTXISD::ProxyReg:
- return "NVPTXISD::ProxyReg";
- case NVPTXISD::LoadV2:
- return "NVPTXISD::LoadV2";
- case NVPTXISD::LoadV4:
- return "NVPTXISD::LoadV4";
- case NVPTXISD::LDGV2:
- return "NVPTXISD::LDGV2";
- case NVPTXISD::LDGV4:
- return "NVPTXISD::LDGV4";
- case NVPTXISD::LDUV2:
- return "NVPTXISD::LDUV2";
- case NVPTXISD::LDUV4:
- return "NVPTXISD::LDUV4";
- case NVPTXISD::StoreV2:
- return "NVPTXISD::StoreV2";
- case NVPTXISD::StoreV4:
- return "NVPTXISD::StoreV4";
- case NVPTXISD::FUN_SHFL_CLAMP:
- return "NVPTXISD::FUN_SHFL_CLAMP";
- case NVPTXISD::FUN_SHFR_CLAMP:
- return "NVPTXISD::FUN_SHFR_CLAMP";
- case NVPTXISD::IMAD:
- return "NVPTXISD::IMAD";
- case NVPTXISD::SETP_F16X2:
- return "NVPTXISD::SETP_F16X2";
- case NVPTXISD::Dummy:
- return "NVPTXISD::Dummy";
- case NVPTXISD::MUL_WIDE_SIGNED:
- return "NVPTXISD::MUL_WIDE_SIGNED";
- case NVPTXISD::MUL_WIDE_UNSIGNED:
- return "NVPTXISD::MUL_WIDE_UNSIGNED";
- case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
- case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
- case NVPTXISD::Tex1DFloatFloatLevel:
- return "NVPTXISD::Tex1DFloatFloatLevel";
- case NVPTXISD::Tex1DFloatFloatGrad:
- return "NVPTXISD::Tex1DFloatFloatGrad";
- case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
- case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
- case NVPTXISD::Tex1DS32FloatLevel:
- return "NVPTXISD::Tex1DS32FloatLevel";
- case NVPTXISD::Tex1DS32FloatGrad:
- return "NVPTXISD::Tex1DS32FloatGrad";
- case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
- case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
- case NVPTXISD::Tex1DU32FloatLevel:
- return "NVPTXISD::Tex1DU32FloatLevel";
- case NVPTXISD::Tex1DU32FloatGrad:
- return "NVPTXISD::Tex1DU32FloatGrad";
- case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
- case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
- case NVPTXISD::Tex1DArrayFloatFloatLevel:
- return "NVPTXISD::Tex1DArrayFloatFloatLevel";
- case NVPTXISD::Tex1DArrayFloatFloatGrad:
- return "NVPTXISD::Tex1DArrayFloatFloatGrad";
- case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
- case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
- case NVPTXISD::Tex1DArrayS32FloatLevel:
- return "NVPTXISD::Tex1DArrayS32FloatLevel";
- case NVPTXISD::Tex1DArrayS32FloatGrad:
- return "NVPTXISD::Tex1DArrayS32FloatGrad";
- case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
- case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
- case NVPTXISD::Tex1DArrayU32FloatLevel:
- return "NVPTXISD::Tex1DArrayU32FloatLevel";
- case NVPTXISD::Tex1DArrayU32FloatGrad:
- return "NVPTXISD::Tex1DArrayU32FloatGrad";
- case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
- case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
- case NVPTXISD::Tex2DFloatFloatLevel:
- return "NVPTXISD::Tex2DFloatFloatLevel";
- case NVPTXISD::Tex2DFloatFloatGrad:
- return "NVPTXISD::Tex2DFloatFloatGrad";
- case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
- case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
- case NVPTXISD::Tex2DS32FloatLevel:
- return "NVPTXISD::Tex2DS32FloatLevel";
- case NVPTXISD::Tex2DS32FloatGrad:
- return "NVPTXISD::Tex2DS32FloatGrad";
- case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
- case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
- case NVPTXISD::Tex2DU32FloatLevel:
- return "NVPTXISD::Tex2DU32FloatLevel";
- case NVPTXISD::Tex2DU32FloatGrad:
- return "NVPTXISD::Tex2DU32FloatGrad";
- case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
- case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
- case NVPTXISD::Tex2DArrayFloatFloatLevel:
- return "NVPTXISD::Tex2DArrayFloatFloatLevel";
- case NVPTXISD::Tex2DArrayFloatFloatGrad:
- return "NVPTXISD::Tex2DArrayFloatFloatGrad";
- case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
- case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
- case NVPTXISD::Tex2DArrayS32FloatLevel:
- return "NVPTXISD::Tex2DArrayS32FloatLevel";
- case NVPTXISD::Tex2DArrayS32FloatGrad:
- return "NVPTXISD::Tex2DArrayS32FloatGrad";
- case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
- case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
- case NVPTXISD::Tex2DArrayU32FloatLevel:
- return "NVPTXISD::Tex2DArrayU32FloatLevel";
- case NVPTXISD::Tex2DArrayU32FloatGrad:
- return "NVPTXISD::Tex2DArrayU32FloatGrad";
- case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
- case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
- case NVPTXISD::Tex3DFloatFloatLevel:
- return "NVPTXISD::Tex3DFloatFloatLevel";
- case NVPTXISD::Tex3DFloatFloatGrad:
- return "NVPTXISD::Tex3DFloatFloatGrad";
- case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
- case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
- case NVPTXISD::Tex3DS32FloatLevel:
- return "NVPTXISD::Tex3DS32FloatLevel";
- case NVPTXISD::Tex3DS32FloatGrad:
- return "NVPTXISD::Tex3DS32FloatGrad";
- case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
- case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
- case NVPTXISD::Tex3DU32FloatLevel:
- return "NVPTXISD::Tex3DU32FloatLevel";
- case NVPTXISD::Tex3DU32FloatGrad:
- return "NVPTXISD::Tex3DU32FloatGrad";
- case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
- case NVPTXISD::TexCubeFloatFloatLevel:
- return "NVPTXISD::TexCubeFloatFloatLevel";
- case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
- case NVPTXISD::TexCubeS32FloatLevel:
- return "NVPTXISD::TexCubeS32FloatLevel";
- case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
- case NVPTXISD::TexCubeU32FloatLevel:
- return "NVPTXISD::TexCubeU32FloatLevel";
- case NVPTXISD::TexCubeArrayFloatFloat:
- return "NVPTXISD::TexCubeArrayFloatFloat";
- case NVPTXISD::TexCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexCubeArrayFloatFloatLevel";
- case NVPTXISD::TexCubeArrayS32Float:
- return "NVPTXISD::TexCubeArrayS32Float";
- case NVPTXISD::TexCubeArrayS32FloatLevel:
- return "NVPTXISD::TexCubeArrayS32FloatLevel";
- case NVPTXISD::TexCubeArrayU32Float:
- return "NVPTXISD::TexCubeArrayU32Float";
- case NVPTXISD::TexCubeArrayU32FloatLevel:
- return "NVPTXISD::TexCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4R2DFloatFloat:
- return "NVPTXISD::Tld4R2DFloatFloat";
- case NVPTXISD::Tld4G2DFloatFloat:
- return "NVPTXISD::Tld4G2DFloatFloat";
- case NVPTXISD::Tld4B2DFloatFloat:
- return "NVPTXISD::Tld4B2DFloatFloat";
- case NVPTXISD::Tld4A2DFloatFloat:
- return "NVPTXISD::Tld4A2DFloatFloat";
- case NVPTXISD::Tld4R2DS64Float:
- return "NVPTXISD::Tld4R2DS64Float";
- case NVPTXISD::Tld4G2DS64Float:
- return "NVPTXISD::Tld4G2DS64Float";
- case NVPTXISD::Tld4B2DS64Float:
- return "NVPTXISD::Tld4B2DS64Float";
- case NVPTXISD::Tld4A2DS64Float:
- return "NVPTXISD::Tld4A2DS64Float";
- case NVPTXISD::Tld4R2DU64Float:
- return "NVPTXISD::Tld4R2DU64Float";
- case NVPTXISD::Tld4G2DU64Float:
- return "NVPTXISD::Tld4G2DU64Float";
- case NVPTXISD::Tld4B2DU64Float:
- return "NVPTXISD::Tld4B2DU64Float";
- case NVPTXISD::Tld4A2DU64Float:
- return "NVPTXISD::Tld4A2DU64Float";
- case NVPTXISD::TexUnified1DFloatS32:
- return "NVPTXISD::TexUnified1DFloatS32";
- case NVPTXISD::TexUnified1DFloatFloat:
- return "NVPTXISD::TexUnified1DFloatFloat";
- case NVPTXISD::TexUnified1DFloatFloatLevel:
- return "NVPTXISD::TexUnified1DFloatFloatLevel";
- case NVPTXISD::TexUnified1DFloatFloatGrad:
- return "NVPTXISD::TexUnified1DFloatFloatGrad";
- case NVPTXISD::TexUnified1DS32S32:
- return "NVPTXISD::TexUnified1DS32S32";
- case NVPTXISD::TexUnified1DS32Float:
- return "NVPTXISD::TexUnified1DS32Float";
- case NVPTXISD::TexUnified1DS32FloatLevel:
- return "NVPTXISD::TexUnified1DS32FloatLevel";
- case NVPTXISD::TexUnified1DS32FloatGrad:
- return "NVPTXISD::TexUnified1DS32FloatGrad";
- case NVPTXISD::TexUnified1DU32S32:
- return "NVPTXISD::TexUnified1DU32S32";
- case NVPTXISD::TexUnified1DU32Float:
- return "NVPTXISD::TexUnified1DU32Float";
- case NVPTXISD::TexUnified1DU32FloatLevel:
- return "NVPTXISD::TexUnified1DU32FloatLevel";
- case NVPTXISD::TexUnified1DU32FloatGrad:
- return "NVPTXISD::TexUnified1DU32FloatGrad";
- case NVPTXISD::TexUnified1DArrayFloatS32:
- return "NVPTXISD::TexUnified1DArrayFloatS32";
- case NVPTXISD::TexUnified1DArrayFloatFloat:
- return "NVPTXISD::TexUnified1DArrayFloatFloat";
- case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified1DArrayS32S32:
- return "NVPTXISD::TexUnified1DArrayS32S32";
- case NVPTXISD::TexUnified1DArrayS32Float:
- return "NVPTXISD::TexUnified1DArrayS32Float";
- case NVPTXISD::TexUnified1DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
- case NVPTXISD::TexUnified1DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
- case NVPTXISD::TexUnified1DArrayU32S32:
- return "NVPTXISD::TexUnified1DArrayU32S32";
- case NVPTXISD::TexUnified1DArrayU32Float:
- return "NVPTXISD::TexUnified1DArrayU32Float";
- case NVPTXISD::TexUnified1DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
- case NVPTXISD::TexUnified1DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
- case NVPTXISD::TexUnified2DFloatS32:
- return "NVPTXISD::TexUnified2DFloatS32";
- case NVPTXISD::TexUnified2DFloatFloat:
- return "NVPTXISD::TexUnified2DFloatFloat";
- case NVPTXISD::TexUnified2DFloatFloatLevel:
- return "NVPTXISD::TexUnified2DFloatFloatLevel";
- case NVPTXISD::TexUnified2DFloatFloatGrad:
- return "NVPTXISD::TexUnified2DFloatFloatGrad";
- case NVPTXISD::TexUnified2DS32S32:
- return "NVPTXISD::TexUnified2DS32S32";
- case NVPTXISD::TexUnified2DS32Float:
- return "NVPTXISD::TexUnified2DS32Float";
- case NVPTXISD::TexUnified2DS32FloatLevel:
- return "NVPTXISD::TexUnified2DS32FloatLevel";
- case NVPTXISD::TexUnified2DS32FloatGrad:
- return "NVPTXISD::TexUnified2DS32FloatGrad";
- case NVPTXISD::TexUnified2DU32S32:
- return "NVPTXISD::TexUnified2DU32S32";
- case NVPTXISD::TexUnified2DU32Float:
- return "NVPTXISD::TexUnified2DU32Float";
- case NVPTXISD::TexUnified2DU32FloatLevel:
- return "NVPTXISD::TexUnified2DU32FloatLevel";
- case NVPTXISD::TexUnified2DU32FloatGrad:
- return "NVPTXISD::TexUnified2DU32FloatGrad";
- case NVPTXISD::TexUnified2DArrayFloatS32:
- return "NVPTXISD::TexUnified2DArrayFloatS32";
- case NVPTXISD::TexUnified2DArrayFloatFloat:
- return "NVPTXISD::TexUnified2DArrayFloatFloat";
- case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified2DArrayS32S32:
- return "NVPTXISD::TexUnified2DArrayS32S32";
- case NVPTXISD::TexUnified2DArrayS32Float:
- return "NVPTXISD::TexUnified2DArrayS32Float";
- case NVPTXISD::TexUnified2DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
- case NVPTXISD::TexUnified2DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
- case NVPTXISD::TexUnified2DArrayU32S32:
- return "NVPTXISD::TexUnified2DArrayU32S32";
- case NVPTXISD::TexUnified2DArrayU32Float:
- return "NVPTXISD::TexUnified2DArrayU32Float";
- case NVPTXISD::TexUnified2DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
- case NVPTXISD::TexUnified2DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
- case NVPTXISD::TexUnified3DFloatS32:
- return "NVPTXISD::TexUnified3DFloatS32";
- case NVPTXISD::TexUnified3DFloatFloat:
- return "NVPTXISD::TexUnified3DFloatFloat";
- case NVPTXISD::TexUnified3DFloatFloatLevel:
- return "NVPTXISD::TexUnified3DFloatFloatLevel";
- case NVPTXISD::TexUnified3DFloatFloatGrad:
- return "NVPTXISD::TexUnified3DFloatFloatGrad";
- case NVPTXISD::TexUnified3DS32S32:
- return "NVPTXISD::TexUnified3DS32S32";
- case NVPTXISD::TexUnified3DS32Float:
- return "NVPTXISD::TexUnified3DS32Float";
- case NVPTXISD::TexUnified3DS32FloatLevel:
- return "NVPTXISD::TexUnified3DS32FloatLevel";
- case NVPTXISD::TexUnified3DS32FloatGrad:
- return "NVPTXISD::TexUnified3DS32FloatGrad";
- case NVPTXISD::TexUnified3DU32S32:
- return "NVPTXISD::TexUnified3DU32S32";
- case NVPTXISD::TexUnified3DU32Float:
- return "NVPTXISD::TexUnified3DU32Float";
- case NVPTXISD::TexUnified3DU32FloatLevel:
- return "NVPTXISD::TexUnified3DU32FloatLevel";
- case NVPTXISD::TexUnified3DU32FloatGrad:
- return "NVPTXISD::TexUnified3DU32FloatGrad";
- case NVPTXISD::TexUnifiedCubeFloatFloat:
- return "NVPTXISD::TexUnifiedCubeFloatFloat";
- case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeS32Float:
- return "NVPTXISD::TexUnifiedCubeS32Float";
- case NVPTXISD::TexUnifiedCubeS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeU32Float:
- return "NVPTXISD::TexUnifiedCubeU32Float";
- case NVPTXISD::TexUnifiedCubeU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayS32Float:
- return "NVPTXISD::TexUnifiedCubeArrayS32Float";
- case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayU32Float:
- return "NVPTXISD::TexUnifiedCubeArrayU32Float";
- case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4UnifiedR2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
- case NVPTXISD::Tld4UnifiedG2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
- case NVPTXISD::Tld4UnifiedB2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
- case NVPTXISD::Tld4UnifiedA2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
- case NVPTXISD::Tld4UnifiedR2DS64Float:
- return "NVPTXISD::Tld4UnifiedR2DS64Float";
- case NVPTXISD::Tld4UnifiedG2DS64Float:
- return "NVPTXISD::Tld4UnifiedG2DS64Float";
- case NVPTXISD::Tld4UnifiedB2DS64Float:
- return "NVPTXISD::Tld4UnifiedB2DS64Float";
- case NVPTXISD::Tld4UnifiedA2DS64Float:
- return "NVPTXISD::Tld4UnifiedA2DS64Float";
- case NVPTXISD::Tld4UnifiedR2DU64Float:
- return "NVPTXISD::Tld4UnifiedR2DU64Float";
- case NVPTXISD::Tld4UnifiedG2DU64Float:
- return "NVPTXISD::Tld4UnifiedG2DU64Float";
- case NVPTXISD::Tld4UnifiedB2DU64Float:
- return "NVPTXISD::Tld4UnifiedB2DU64Float";
- case NVPTXISD::Tld4UnifiedA2DU64Float:
- return "NVPTXISD::Tld4UnifiedA2DU64Float";
- case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
- case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
- case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
- case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
- case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
- case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
- case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
- case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
- case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
- case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
- case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
- case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
- case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
- case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
- case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
- case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
- case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
- case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
- case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
- case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
- case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
- case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
- case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
- case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
- case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
- case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
- case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
- case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
- case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
- case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
- case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
- case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
- case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
- case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
- case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
- case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
- case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
- case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
- case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
- case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
- case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
- case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
- case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
- case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
- case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
- case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
- case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
- case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
- case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
- case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
- case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
- case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
- case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
- case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
- case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
- case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
- case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
- case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
- case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
- case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
- case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
- case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
- case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
- case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
- case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
- case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
- case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
- case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
- case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
- case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
- case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
- case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
- case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
- case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
- case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
- case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
- case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
- case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
- case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
- case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
- case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
- case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
- case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
- case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
- case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
- case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
- case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
- case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
- case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
- case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
- case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
- case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
- case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
- case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
- case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
- case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
- case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
- case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
- case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
- case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
- case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
- case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
- case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
- case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
- case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
- case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
- case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
- case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
- case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
- case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
- case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
- case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
- case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
- case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
- case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
- case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
- case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
- case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
- case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
- case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
- case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
- case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
- case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
- case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
- case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
- case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
- case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
- case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
- case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
- case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
- case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
- case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
- case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
- case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
- case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
- case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
- case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
- case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
- case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
- case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
- case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
- case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
- case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
- case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
- case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
- case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
- case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
- case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
- case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
- case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
- case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
- case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
- case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
- case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
- case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
- case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
- case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
- case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
- case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
- case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
- case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
- case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
- case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
- case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
- case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
- }
- return nullptr;
- }
- TargetLoweringBase::LegalizeTypeAction
- NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
- if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
- VT.getScalarType() == MVT::i1)
- return TypeSplitVector;
- if (VT == MVT::v2f16)
- return TypeLegal;
- return TargetLoweringBase::getPreferredVectorAction(VT);
- }
- SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
- int Enabled, int &ExtraSteps,
- bool &UseOneConst,
- bool Reciprocal) const {
- if (!(Enabled == ReciprocalEstimate::Enabled ||
- (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
- return SDValue();
- if (ExtraSteps == ReciprocalEstimate::Unspecified)
- ExtraSteps = 0;
- SDLoc DL(Operand);
- EVT VT = Operand.getValueType();
- bool Ftz = useF32FTZ(DAG.getMachineFunction());
- auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(IID, DL, MVT::i32), Operand);
- };
- // The sqrt and rsqrt refinement processes assume we always start out with an
- // approximation of the rsqrt. Therefore, if we're going to do any refinement
- // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
- // any refinement, we must return a regular sqrt.
- if (Reciprocal || ExtraSteps > 0) {
- if (VT == MVT::f32)
- return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
- : Intrinsic::nvvm_rsqrt_approx_f);
- else if (VT == MVT::f64)
- return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
- else
- return SDValue();
- } else {
- if (VT == MVT::f32)
- return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
- : Intrinsic::nvvm_sqrt_approx_f);
- else {
- // There's no sqrt.approx.f64 instruction, so we emit
- // reciprocal(rsqrt(x)). This is faster than
- // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
- // x * rsqrt(x).)
- return DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
- MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
- }
- }
- }
- SDValue
- NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
- auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
- Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
- return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
- }
- std::string NVPTXTargetLowering::getPrototype(
- const DataLayout &DL, Type *retTy, const ArgListTy &Args,
- const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
- const CallBase &CB, unsigned UniqueCallSite) const {
- auto PtrVT = getPointerTy(DL);
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return "";
- std::stringstream O;
- O << "prototype_" << UniqueCallSite << " : .callprototype ";
- if (retTy->getTypeID() == Type::VoidTyID) {
- O << "()";
- } else {
- O << "(";
- if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
- unsigned size = 0;
- if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
- size = ITy->getBitWidth();
- } else {
- assert(retTy->isFloatingPointTy() &&
- "Floating point type expected here");
- size = retTy->getPrimitiveSizeInBits();
- }
- // PTX ABI requires all scalar return values to be at least 32
- // bits in size. fp16 normally uses .b16 as its storage type in
- // PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
- O << ".param .b" << size << " _";
- } else if (isa<PointerType>(retTy)) {
- O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
- retTy->isIntegerTy(128)) {
- O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
- << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
- } else {
- llvm_unreachable("Unknown return type");
- }
- O << ") ";
- }
- O << "_ (";
- bool first = true;
- unsigned OIdx = 0;
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- Type *Ty = Args[i].Ty;
- if (!first) {
- O << ", ";
- }
- first = false;
- if (!Outs[OIdx].Flags.isByVal()) {
- if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
- unsigned align = 0;
- const CallInst *CallI = cast<CallInst>(&CB);
- // +1 because index 0 is reserved for return type alignment
- if (!getAlign(*CallI, i + 1, align))
- align = DL.getABITypeAlignment(Ty);
- unsigned sz = DL.getTypeAllocSize(Ty);
- O << ".param .align " << align << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- // update the index for Outs
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, DL, Ty, vtparts);
- if (unsigned len = vtparts.size())
- OIdx += len - 1;
- continue;
- }
- // i8 types in IR will be i16 types in SDAG
- assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
- (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
- "type mismatch between callee prototype and arguments");
- // scalar type
- unsigned sz = 0;
- if (isa<IntegerType>(Ty)) {
- sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
- } else if (isa<PointerType>(Ty)) {
- sz = PtrVT.getSizeInBits();
- } else if (Ty->isHalfTy())
- // PTX ABI requires all scalar parameters to be at least 32
- // bits in size. fp16 normally uses .b16 as its storage type
- // in PTX, so its size must be adjusted here, too.
- sz = 32;
- else
- sz = Ty->getPrimitiveSizeInBits();
- O << ".param .b" << sz << " ";
- O << "_";
- continue;
- }
- auto *PTy = dyn_cast<PointerType>(Ty);
- assert(PTy && "Param with byval attribute should be a pointer type");
- Type *ETy = PTy->getPointerElementType();
- Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
- unsigned sz = DL.getTypeAllocSize(ETy);
- O << ".param .align " << align.value() << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- }
- O << ");";
- return O.str();
- }
- Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
- const CallBase *CB, Type *Ty,
- unsigned Idx,
- const DataLayout &DL) const {
- if (!CB) {
- // CallSite is zero, fallback to ABI type alignment
- return DL.getABITypeAlign(Ty);
- }
- unsigned Alignment = 0;
- const Function *DirectCallee = CB->getCalledFunction();
- if (!DirectCallee) {
- // We don't have a direct function symbol, but that may be because of
- // constant cast instructions in the call.
- // With bitcast'd call targets, the instruction will be the call
- if (const auto *CI = dyn_cast<CallInst>(CB)) {
- // Check if we have call alignment metadata
- if (getAlign(*CI, Idx, Alignment))
- return Align(Alignment);
- const Value *CalleeV = CI->getCalledOperand();
- // Ignore any bitcast instructions
- while (isa<ConstantExpr>(CalleeV)) {
- const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
- if (!CE->isCast())
- break;
- // Look through the bitcast
- CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
- }
- // We have now looked past all of the bitcasts. Do we finally have a
- // Function?
- if (const auto *CalleeF = dyn_cast<Function>(CalleeV))
- DirectCallee = CalleeF;
- }
- }
- // Check for function alignment information if we found that the
- // ultimate target is a Function
- if (DirectCallee)
- if (getAlign(*DirectCallee, Idx, Alignment))
- return Align(Alignment);
- // Call is indirect or alignment information is not available, fall back to
- // the ABI type alignment
- return DL.getABITypeAlign(Ty);
- }
- SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
- SelectionDAG &DAG = CLI.DAG;
- SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
- SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
- ArgListTy &Args = CLI.getArgs();
- Type *RetTy = CLI.RetTy;
- const CallBase *CB = CLI.CB;
- const DataLayout &DL = DAG.getDataLayout();
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
- unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
- SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
- SDValue InFlag = Chain.getValue(1);
- unsigned paramCount = 0;
- // Args.size() and Outs.size() need not match.
- // Outs.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Outs)
- // * if there is a vector argument with more than typical vector-length
- // elements (generally if more than 4) where each vector element is
- // individually present in Outs.
- // So a different index should be used for indexing into Outs/OutVals.
- // See similar issue in LowerFormalArguments.
- unsigned OIdx = 0;
- // Declare the .params or .reg need to pass values
- // to the function
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- EVT VT = Outs[OIdx].VT;
- Type *Ty = Args[i].Ty;
- if (!Outs[OIdx].Flags.isByVal()) {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
- Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL);
- unsigned AllocSize = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- bool NeedAlign; // Does argument declaration specify alignment?
- if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- SDValue DeclareParamOps[] = {
- Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- NeedAlign = true;
- } else {
- // declare .param .b<size> .param<n>;
- if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- AllocSize = 4;
- }
- SDValue DeclareScalarParamOps[] = {
- Chain, DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(AllocSize * 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
- DeclareScalarParamOps);
- NeedAlign = false;
- }
- InFlag = Chain.getValue(1);
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
- // than 32-bits are sign extended or zero extended, depending on
- // whether they are signed or unsigned types. This case applies
- // only to scalar parameters and not to aggregate values.
- bool ExtendIntegerParam =
- Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
- auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
- SmallVector<SDValue, 6> StoreOperands;
- for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
- // New store.
- if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceding store.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
- StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
- }
- EVT EltVT = VTs[j];
- SDValue StVal = OutVals[OIdx];
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND,
- dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- // Record the value to store.
- StoreOperands.push_back(StVal);
- if (VectorInfo[j] & PVF_LAST) {
- unsigned NumElts = StoreOperands.size() - 3;
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
- StoreOperands.push_back(InFlag);
- // Adjust type of the store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
- MaybeAlign EltAlign;
- if (NeedAlign)
- EltAlign = commonAlignment(ArgAlign, Offsets[j]);
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign,
- MachineMemOperand::MOStore);
- InFlag = Chain.getValue(1);
- // Cleanup.
- StoreOperands.clear();
- }
- ++OIdx;
- }
- assert(StoreOperands.empty() && "Unfinished parameter store.");
- if (VTs.size() > 0)
- --OIdx;
- ++paramCount;
- continue;
- }
- // ByVal arguments
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
- assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets,
- 0);
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = Outs[OIdx].Flags.getByValSize();
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
- // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
- // so we don't need to worry about natural alignment or not.
- // See TargetLowering::LowerCallTo().
- // Enforce minumum alignment of 4 to work around ptxas miscompile
- // for sm_50+. See corresponding alignment adjustment in
- // emitFunctionParamList() for details.
- if (ArgAlign < Align(4))
- ArgAlign = Align(4);
- SDValue DeclareParamOps[] = {
- Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
- EVT elemtype = VTs[j];
- int curOffset = Offsets[j];
- unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
- auto PtrVT = getPointerTy(DL);
- SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
- DAG.getConstant(curOffset, dl, PtrVT));
- SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), PartAlign);
- if (elemtype.getSizeInBits() < 16) {
- theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(curOffset, dl, MVT::i32),
- theVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(
- NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
- MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
- InFlag = Chain.getValue(1);
- }
- ++paramCount;
- }
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
- MaybeAlign retAlignment = None;
- // Handle Result
- if (Ins.size() > 0) {
- SmallVector<EVT, 16> resvtparts;
- ComputeValueVTs(*this, DL, RetTy, resvtparts);
- // Declare
- // .param .align 16 .b8 retval0[<size-in-bytes>], or
- // .param .b<size-in-bits> retval0
- unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
- // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
- // these three types to match the logic in
- // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
- // Plus, this behavior is consistent with nvcc's.
- if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
- (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
- // Scalar needs to be at least 32bit wide
- if (resultsz < 32)
- resultsz = 32;
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(resultsz, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- } else {
- retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
- assert(retAlignment && "retAlignment is guaranteed to be set");
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = {
- Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
- DAG.getConstant(resultsz / 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- }
- }
- // Both indirect calls and libcalls have nullptr Func. In order to distinguish
- // between them we must rely on the call site value which is valid for
- // indirect calls but is always null for libcalls.
- bool isIndirectCall = !Func && CB;
- if (isa<ExternalSymbolSDNode>(Callee)) {
- Function* CalleeFunc = nullptr;
- // Try to find the callee in the current module.
- Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
- assert(CalleeFunc != nullptr && "Libcall callee must be set.");
- // Set the "libcall callee" attribute to indicate that the function
- // must always have a declaration.
- CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
- }
- if (isIndirectCall) {
- // This is indirect function call case : PTX requires a prototype of the
- // form
- // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
- // to be emitted, and the label has to used as the last arg of call
- // instruction.
- // The prototype is embedded in a string and put as the operand for a
- // CallPrototype SDNode which will print out to the value of the string.
- SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto =
- getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite);
- const char *ProtoStr =
- nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
- SDValue ProtoOps[] = {
- Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
- };
- Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
- InFlag = Chain.getValue(1);
- }
- // Op to just print "call"
- SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrintCallOps[] = {
- Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
- };
- // We model convergent calls as separate opcodes.
- unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
- if (CLI.IsConvergent)
- Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
- : NVPTXISD::PrintConvergentCall;
- Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
- InFlag = Chain.getValue(1);
- // Ops to print out the function name
- SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallVoidOps[] = { Chain, Callee, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
- InFlag = Chain.getValue(1);
- // Ops to print out the param list
- SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgBeginOps[] = { Chain, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
- CallArgBeginOps);
- InFlag = Chain.getValue(1);
- for (unsigned i = 0, e = paramCount; i != e; ++i) {
- unsigned opcode;
- if (i == (e - 1))
- opcode = NVPTXISD::LastCallArg;
- else
- opcode = NVPTXISD::CallArg;
- SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(i, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
- InFlag = Chain.getValue(1);
- }
- SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgEndOps[] = { Chain,
- DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
- InFlag = Chain.getValue(1);
- if (isIndirectCall) {
- SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrototypeOps[] = {
- Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
- InFlag = Chain.getValue(1);
- }
- SmallVector<SDValue, 16> ProxyRegOps;
- SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
- // Generate loads from param memory/moves from registers for result
- if (Ins.size() > 0) {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
- assert(VTs.size() == Ins.size() && "Bad value decomposition");
- Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
- auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
- SmallVector<EVT, 6> LoadVTs;
- int VecIdx = -1; // Index of the first element of the vector.
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
- // 32-bits are sign extended or zero extended, depending on whether
- // they are signed or unsigned types.
- bool ExtendIntegerRetVal =
- RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
- for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- bool needTruncate = false;
- EVT TheLoadType = VTs[i];
- EVT EltType = Ins[i].VT;
- Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- needTruncate = true;
- } else if (TheLoadType.getSizeInBits() < 16) {
- if (VTs[i].isInteger())
- needTruncate = true;
- EltType = MVT::i16;
- }
- // Record index of the very first element of the vector.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
- VecIdx = i;
- }
- LoadVTs.push_back(EltType);
- if (VectorInfo[i] & PVF_LAST) {
- unsigned NumElts = LoadVTs.size();
- LoadVTs.push_back(MVT::Other);
- LoadVTs.push_back(MVT::Glue);
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
- SDValue LoadOperands[] = {
- Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign,
- MachineMemOperand::MOLoad);
- for (unsigned j = 0; j < NumElts; ++j) {
- ProxyRegOps.push_back(RetVal.getValue(j));
- if (needTruncate)
- ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
- else
- ProxyRegTruncates.push_back(Optional<MVT>());
- }
- Chain = RetVal.getValue(NumElts);
- InFlag = RetVal.getValue(NumElts + 1);
- // Cleanup
- VecIdx = -1;
- LoadVTs.clear();
- }
- }
- }
- Chain = DAG.getCALLSEQ_END(
- Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true),
- DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl);
- InFlag = Chain.getValue(1);
- // Append ProxyReg instructions to the chain to make sure that `callseq_end`
- // will not get lost. Otherwise, during libcalls expansion, the nodes can become
- // dangling.
- for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
- SDValue Ret = DAG.getNode(
- NVPTXISD::ProxyReg, dl,
- DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
- { Chain, ProxyRegOps[i], InFlag }
- );
- Chain = Ret.getValue(1);
- InFlag = Ret.getValue(2);
- if (ProxyRegTruncates[i].hasValue()) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
- }
- InVals.push_back(Ret);
- }
- // set isTailCall to false for now, until we figure out how to express
- // tail call optimization in PTX
- isTailCall = false;
- return Chain;
- }
- // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
- // (see LegalizeDAG.cpp). This is slow and uses local memory.
- // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
- SDValue
- NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- SmallVector<SDValue, 8> Ops;
- unsigned NumOperands = Node->getNumOperands();
- for (unsigned i = 0; i < NumOperands; ++i) {
- SDValue SubOp = Node->getOperand(i);
- EVT VVT = SubOp.getNode()->getValueType(0);
- EVT EltVT = VVT.getVectorElementType();
- unsigned NumSubElem = VVT.getVectorNumElements();
- for (unsigned j = 0; j < NumSubElem; ++j) {
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
- DAG.getIntPtrConstant(j, dl)));
- }
- }
- return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
- }
- // We can init constant f16x2 with a single .b32 move. Normally it
- // would get lowered as two constant loads and vector-packing move.
- // mov.b16 %h1, 0x4000;
- // mov.b16 %h2, 0x3C00;
- // mov.b32 %hh2, {%h2, %h1};
- // Instead we want just a constant move:
- // mov.b32 %hh2, 0x40003C00
- //
- // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
- // generates good SASS in both cases.
- SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
- //return Op;
- if (!(Op->getValueType(0) == MVT::v2f16 &&
- isa<ConstantFPSDNode>(Op->getOperand(0)) &&
- isa<ConstantFPSDNode>(Op->getOperand(1))))
- return Op;
- APInt E0 =
- cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
- APInt E1 =
- cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
- SDValue Const =
- DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
- return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
- }
- SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Index = Op->getOperand(1);
- // Constant index will be matched by tablegen.
- if (isa<ConstantSDNode>(Index.getNode()))
- return Op;
- // Extract individual elements and select one of them.
- SDValue Vector = Op->getOperand(0);
- EVT VectorVT = Vector.getValueType();
- assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
- EVT EltVT = VectorVT.getVectorElementType();
- SDLoc dl(Op.getNode());
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
- DAG.getIntPtrConstant(0, dl));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
- DAG.getIntPtrConstant(1, dl));
- return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
- ISD::CondCode::SETEQ);
- }
- /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
- /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
- /// amount, or
- /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
- /// amount.
- SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // dHi = aHi >> Amt
- // dLo = shf.r.clamp aLo, aHi, Amt
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // - if (Amt>=size) then
- // dLo = aHi >> (Amt-size)
- // dHi = aHi >> Amt (this is either all 0 or all 1)
- // else
- // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
- // dHi = aHi >> Amt
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- }
- /// LowerShiftLeftParts - Lower SHL_PARTS, which
- /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
- /// amount, or
- /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
- /// amount.
- SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SHL_PARTS);
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} << Amt
- // dHi = shf.l.clamp aLo, aHi, Amt
- // dLo = aLo << Amt
- SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} << Amt
- // - if (Amt>=size) then
- // dLo = aLo << Amt (all 0)
- // dLo = aLo << (Amt-size)
- // else
- // dLo = aLo << Amt
- // dHi = (aHi << Amt) | (aLo >> (size-Amt))
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- }
- SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- if (VT == MVT::f32)
- return LowerFROUND32(Op, DAG);
- if (VT == MVT::f64)
- return LowerFROUND64(Op, DAG);
- llvm_unreachable("unhandled type");
- }
- // This is the the rounding method used in CUDA libdevice in C like code:
- // float roundf(float A)
- // {
- // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
- // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
- // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
- // }
- SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc SL(Op);
- SDValue A = Op.getOperand(0);
- EVT VT = Op.getValueType();
- SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
- // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
- SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
- const int SignBitMask = 0x80000000;
- SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
- DAG.getConstant(SignBitMask, SL, MVT::i32));
- const int PointFiveInBits = 0x3F000000;
- SDValue PointFiveWithSignRaw =
- DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
- DAG.getConstant(PointFiveInBits, SL, MVT::i32));
- SDValue PointFiveWithSign =
- DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
- SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
- SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
- // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
- EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- SDValue IsLarge =
- DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
- ISD::SETOGT);
- RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
- // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
- SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
- DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
- SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
- return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
- }
- // The implementation of round(double) is similar to that of round(float) in
- // that they both separate the value range into three regions and use a method
- // specific to the region to round the values. However, round(double) first
- // calculates the round of the absolute value and then adds the sign back while
- // round(float) directly rounds the value with sign.
- SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc SL(Op);
- SDValue A = Op.getOperand(0);
- EVT VT = Op.getValueType();
- SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
- // double RoundedA = (double) (int) (abs(A) + 0.5f);
- SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
- DAG.getConstantFP(0.5, SL, VT));
- SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
- // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
- EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
- DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
- RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
- DAG.getConstantFP(0, SL, VT),
- RoundedA);
- // Add sign to rounded_A
- RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
- DAG.getNode(ISD::FTRUNC, SL, VT, A);
- // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
- SDValue IsLarge =
- DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
- ISD::SETOGT);
- return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
- }
- SDValue
- NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
- case ISD::RETURNADDR:
- return SDValue();
- case ISD::FRAMEADDR:
- return SDValue();
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- case ISD::INTRINSIC_W_CHAIN:
- return Op;
- case ISD::BUILD_VECTOR:
- return LowerBUILD_VECTOR(Op, DAG);
- case ISD::EXTRACT_SUBVECTOR:
- return Op;
- case ISD::EXTRACT_VECTOR_ELT:
- return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS:
- return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::STORE:
- return LowerSTORE(Op, DAG);
- case ISD::LOAD:
- return LowerLOAD(Op, DAG);
- case ISD::SHL_PARTS:
- return LowerShiftLeftParts(Op, DAG);
- case ISD::SRA_PARTS:
- case ISD::SRL_PARTS:
- return LowerShiftRightParts(Op, DAG);
- case ISD::SELECT:
- return LowerSelect(Op, DAG);
- case ISD::FROUND:
- return LowerFROUND(Op, DAG);
- default:
- llvm_unreachable("Custom lowering not defined for operation");
- }
- }
- SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
- SDValue Op0 = Op->getOperand(0);
- SDValue Op1 = Op->getOperand(1);
- SDValue Op2 = Op->getOperand(2);
- SDLoc DL(Op.getNode());
- assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
- Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
- Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
- return Trunc;
- }
- SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
- // v2f16 is legal, so we can't rely on legalizer to handle unaligned
- // loads and have to handle it here.
- if (Op.getValueType() == MVT::v2f16) {
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
- }
- }
- return SDValue();
- }
- // v = ld i1* addr
- // =>
- // v1 = ld i8* addr (-> i16)
- // v = trunc i16 to i1
- SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
- assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
- SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->getAlignment(),
- LD->getMemOperand()->getFlags());
- SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
- // The legalizer (the caller) is expecting two values from the legalized
- // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
- // in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
- }
- SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT VT = Store->getMemoryVT();
- if (VT == MVT::i1)
- return LowerSTOREi1(Op, DAG);
- // v2f16 is legal, so we can't rely on legalizer to handle unaligned
- // stores and have to handle it here.
- if (VT == MVT::v2f16 &&
- !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand()))
- return expandUnalignedStore(Store, DAG);
- if (VT.isVector())
- return LowerSTOREVector(Op, DAG);
- return SDValue();
- }
- SDValue
- NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
- SDNode *N = Op.getNode();
- SDValue Val = N->getOperand(1);
- SDLoc DL(N);
- EVT ValVT = Val.getValueType();
- if (ValVT.isVector()) {
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 stores of <2 x double> here
- // but I'm leaving that as a TODO for now.
- if (!ValVT.isSimple())
- return SDValue();
- switch (ValVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
- // This is a "native" vector type
- break;
- }
- MemSDNode *MemSD = cast<MemSDNode>(N);
- const DataLayout &TD = DAG.getDataLayout();
- Align Alignment = MemSD->getAlign();
- Align PrefAlign =
- TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This store is not sufficiently aligned, so bail out and let this vector
- // store be scalarized. Note that we may still be able to emit smaller
- // vector stores. For example, if we are storing a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return SDValue();
- }
- unsigned Opcode = 0;
- EVT EltVT = ValVT.getVectorElementType();
- unsigned NumElts = ValVT.getVectorNumElements();
- // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // stored type to i16 and propagate the "real" type as the memory type.
- bool NeedExt = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExt = true;
- bool StoreF16x2 = false;
- switch (NumElts) {
- default:
- return SDValue();
- case 2:
- Opcode = NVPTXISD::StoreV2;
- break;
- case 4:
- Opcode = NVPTXISD::StoreV4;
- break;
- case 8:
- // v8f16 is a special case. PTX doesn't have st.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // store them with st.v4.b32.
- assert(EltVT == MVT::f16 && "Wrong type for the vector.");
- Opcode = NVPTXISD::StoreV4;
- StoreF16x2 = true;
- break;
- }
- SmallVector<SDValue, 8> Ops;
- // First is the chain
- Ops.push_back(N->getOperand(0));
- if (StoreF16x2) {
- // Combine f16,f16 -> v2f16
- NumElts /= 2;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
- DAG.getIntPtrConstant(i * 2, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
- DAG.getIntPtrConstant(i * 2 + 1, DL));
- SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
- Ops.push_back(V2);
- }
- } else {
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
- }
- }
- // Then any remaining arguments
- Ops.append(N->op_begin() + 2, N->op_end());
- SDValue NewSt =
- DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
- // return DCI.CombineTo(N, NewSt, true);
- return NewSt;
- }
- return SDValue();
- }
- // st i1 v, addr
- // =>
- // v1 = zxt v to i16
- // st.u8 i16, addr
- SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- StoreSDNode *ST = cast<StoreSDNode>(Node);
- SDValue Tmp1 = ST->getChain();
- SDValue Tmp2 = ST->getBasePtr();
- SDValue Tmp3 = ST->getValue();
- assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
- Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
- SDValue Result =
- DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
- ST->getAlignment(), ST->getMemOperand()->getFlags());
- return Result;
- }
- SDValue
- NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
- std::string ParamSym;
- raw_string_ostream ParamStr(ParamSym);
- ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
- ParamStr.flush();
- std::string *SavedStr =
- nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
- return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
- }
- // Check to see if the kernel argument is image*_t or sampler_t
- static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
- static const char *const specialTypes[] = { "struct._image2d_t",
- "struct._image3d_t",
- "struct._sampler_t" };
- Type *Ty = arg->getType();
- auto *PTy = dyn_cast<PointerType>(Ty);
- if (!PTy)
- return false;
- if (!context)
- return false;
- auto *STy = dyn_cast<StructType>(PTy->getPointerElementType());
- if (!STy || STy->isLiteral())
- return false;
- return llvm::is_contained(specialTypes, STy->getName());
- }
- SDValue NVPTXTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const DataLayout &DL = DAG.getDataLayout();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- const Function *F = &MF.getFunction();
- const AttributeList &PAL = F->getAttributes();
- const TargetLowering *TLI = STI.getTargetLowering();
- SDValue Root = DAG.getRoot();
- std::vector<SDValue> OutChains;
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
- std::vector<Type *> argTypes;
- std::vector<const Argument *> theArgs;
- for (const Argument &I : F->args()) {
- theArgs.push_back(&I);
- argTypes.push_back(I.getType());
- }
- // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
- // Ins.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Ins)
- // * if there is a vector argument with more than typical vector-length
- // elements (generally if more than 4) where each vector element is
- // individually present in Ins.
- // So a different index should be used for indexing into Ins.
- // See similar issue in LowerCall.
- unsigned InsIdx = 0;
- int idx = 0;
- for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
- Type *Ty = argTypes[i];
- // If the kernel argument is image*_t or sampler_t, convert it to
- // a i32 constant holding the parameter position. This can later
- // matched in the AsmPrinter to output the correct mangled name.
- if (isImageOrSamplerVal(
- theArgs[i],
- (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
- : nullptr))) {
- assert(isKernelFunction(*F) &&
- "Only kernels can have image/sampler params");
- InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
- continue;
- }
- if (theArgs[i]->use_empty()) {
- // argument is dead
- if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
- SmallVector<EVT, 16> vtparts;
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
- assert(vtparts.size() > 0 && "empty aggregate type not expected");
- for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
- ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (vtparts.size() > 0)
- --InsIdx;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
- for (unsigned parti = 0; parti < NumRegs; ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (NumRegs > 0)
- --InsIdx;
- continue;
- }
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- continue;
- }
- // In the following cases, assign a node order of "idx+1"
- // to newly created nodes. The SDNodes for params have to
- // appear in the same order as their order of appearance
- // in the original function. "idx+1" holds that order.
- if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
- bool aggregateIsPacked = false;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- aggregateIsPacked = STy->isPacked();
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
- assert(VTs.size() > 0 && "Unexpected empty type.");
- auto VectorInfo =
- VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- int VecIdx = -1; // Index of the first element of the current vector.
- for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
- if (VectorInfo[parti] & PVF_FIRST) {
- assert(VecIdx == -1 && "Orphaned vector.");
- VecIdx = parti;
- }
- // That's the last element of this store op.
- if (VectorInfo[parti] & PVF_LAST) {
- unsigned NumElts = parti - VecIdx + 1;
- EVT EltVT = VTs[parti];
- // i1 is loaded/stored as i8.
- EVT LoadVT = EltVT;
- if (EltVT == MVT::i1)
- LoadVT = MVT::i8;
- else if (EltVT == MVT::v2f16)
- // getLoad needs a vector type, but it can't handle
- // vectors which contain v2f16 elements. So we must load
- // using i32 here and then bitcast back.
- LoadVT = MVT::i32;
- EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
- SDValue VecAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
- Value *srcValue = Constant::getNullValue(PointerType::get(
- EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P =
- DAG.getLoad(VecVT, dl, Root, VecAddr,
- MachinePointerInfo(srcValue), aggregateIsPacked,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
- for (unsigned j = 0; j < NumElts; ++j) {
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
- DAG.getIntPtrConstant(j, dl));
- // We've loaded i1 as an i8 and now must truncate it back to i1
- if (EltVT == MVT::i1)
- Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
- // v2f16 was loaded as an i32. Now we must bitcast it back.
- else if (EltVT == MVT::v2f16)
- Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
- // Extend the element if necessary (e.g. an i8 is loaded
- // into an i16 register)
- if (Ins[InsIdx].VT.isInteger() &&
- Ins[InsIdx].VT.getFixedSizeInBits() >
- LoadVT.getFixedSizeInBits()) {
- unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
- Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
- }
- InVals.push_back(Elt);
- }
- // Reset vector tracking state.
- VecIdx = -1;
- }
- ++InsIdx;
- }
- if (VTs.size() > 0)
- --InsIdx;
- continue;
- }
- // Param has ByVal attribute
- // Return MoveParam(param symbol).
- // Ideally, the param symbol can be returned directly,
- // but when SDNode builder decides to use it in a CopyToReg(),
- // machine instruction fails because TargetExternalSymbol
- // (not lowered) is target dependent, and CopyToReg assumes
- // the source is lowered.
- EVT ObjectVT = getValueType(DL, Ty);
- assert(ObjectVT == Ins[InsIdx].VT &&
- "Ins type did not match function type");
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- }
- // Clang will check explicit VarArg and issue error if any. However, Clang
- // will let code with
- // implicit var arg like f() pass. See bug 617733.
- // We treat this case as if the arg list is empty.
- // if (F.isVarArg()) {
- // assert(0 && "VarArg not supported yet!");
- //}
- if (!OutChains.empty())
- DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
- return Chain;
- }
- SDValue
- NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &dl, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- Type *RetTy = MF.getFunction().getReturnType();
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
- const DataLayout &DL = DAG.getDataLayout();
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
- assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
- auto VectorInfo = VectorizePTXValueVTs(
- VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
- // 32-bits are sign extended or zero extended, depending on whether
- // they are signed or unsigned types.
- bool ExtendIntegerRetVal =
- RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
- SmallVector<SDValue, 6> StoreOperands;
- for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- // New load/store. Record chain and offset operands.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Orphaned operand list.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
- }
- SDValue RetVal = OutVals[i];
- if (ExtendIntegerRetVal) {
- RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND,
- dl, MVT::i32, RetVal);
- } else if (RetVal.getValueSizeInBits() < 16) {
- // Use 16-bit registers for small load-stores as it's the
- // smallest general purpose register size supported by NVPTX.
- RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
- }
- // Record the value to return.
- StoreOperands.push_back(RetVal);
- // That's the last element of this store op.
- if (VectorInfo[i] & PVF_LAST) {
- NVPTXISD::NodeType Op;
- unsigned NumElts = StoreOperands.size() - 2;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreRetval;
- break;
- case 2:
- Op = NVPTXISD::StoreRetvalV2;
- break;
- case 4:
- Op = NVPTXISD::StoreRetvalV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
- // Adjust type of load/store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
- MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
- // Cleanup vector state.
- StoreOperands.clear();
- }
- }
- return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
- }
- void NVPTXTargetLowering::LowerAsmOperandForConstraint(
- SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const {
- if (Constraint.length() > 1)
- return;
- else
- TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
- }
- static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- return NVPTXISD::Tex1DFloatS32;
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloat;
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- return NVPTXISD::Tex1DS32S32;
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- return NVPTXISD::Tex1DS32Float;
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- return NVPTXISD::Tex1DU32S32;
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- return NVPTXISD::Tex1DU32Float;
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- return NVPTXISD::Tex1DArrayFloatS32;
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- return NVPTXISD::Tex1DArrayS32S32;
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- return NVPTXISD::Tex1DArrayU32S32;
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatGrad;
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- return NVPTXISD::Tex2DFloatS32;
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloat;
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- return NVPTXISD::Tex2DS32S32;
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- return NVPTXISD::Tex2DS32Float;
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- return NVPTXISD::Tex2DU32S32;
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- return NVPTXISD::Tex2DU32Float;
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- return NVPTXISD::Tex2DArrayFloatS32;
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- return NVPTXISD::Tex2DArrayS32S32;
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- return NVPTXISD::Tex2DArrayU32S32;
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatGrad;
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- return NVPTXISD::Tex3DFloatS32;
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloat;
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- return NVPTXISD::Tex3DS32S32;
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- return NVPTXISD::Tex3DS32Float;
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatGrad;
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- return NVPTXISD::Tex3DU32S32;
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- return NVPTXISD::Tex3DU32Float;
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatGrad;
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloat;
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- return NVPTXISD::TexCubeS32Float;
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- return NVPTXISD::TexCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- return NVPTXISD::TexCubeU32Float;
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- return NVPTXISD::TexCubeU32FloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32FloatLevel;
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- return NVPTXISD::Tld4R2DFloatFloat;
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- return NVPTXISD::Tld4G2DFloatFloat;
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- return NVPTXISD::Tld4B2DFloatFloat;
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- return NVPTXISD::Tld4A2DFloatFloat;
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- return NVPTXISD::Tld4R2DS64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- return NVPTXISD::Tld4G2DS64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- return NVPTXISD::Tld4B2DS64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- return NVPTXISD::Tld4A2DS64Float;
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- return NVPTXISD::Tld4R2DU64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- return NVPTXISD::Tld4G2DU64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- return NVPTXISD::Tld4B2DU64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- return NVPTXISD::Tld4A2DU64Float;
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- return NVPTXISD::TexUnified1DFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- return NVPTXISD::TexUnified1DS32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- return NVPTXISD::TexUnified1DS32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- return NVPTXISD::TexUnified1DU32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- return NVPTXISD::TexUnified1DU32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- return NVPTXISD::TexUnified1DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- return NVPTXISD::TexUnified1DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- return NVPTXISD::TexUnified1DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- return NVPTXISD::TexUnified2DFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- return NVPTXISD::TexUnified2DS32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- return NVPTXISD::TexUnified2DS32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- return NVPTXISD::TexUnified2DU32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- return NVPTXISD::TexUnified2DU32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- return NVPTXISD::TexUnified2DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- return NVPTXISD::TexUnified2DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- return NVPTXISD::TexUnified2DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- return NVPTXISD::TexUnified3DFloatS32;
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloat;
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- return NVPTXISD::TexUnified3DS32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- return NVPTXISD::TexUnified3DS32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- return NVPTXISD::TexUnified3DU32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- return NVPTXISD::TexUnified3DU32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatGrad;
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedR2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedG2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedB2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedA2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedR2DS64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedG2DS64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedB2DS64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedA2DS64Float;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedR2DU64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedG2DU64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedB2DU64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedA2DU64Float;
- }
- }
- static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- return NVPTXISD::Suld1DI8Clamp;
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- return NVPTXISD::Suld1DI16Clamp;
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- return NVPTXISD::Suld1DI32Clamp;
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- return NVPTXISD::Suld1DI64Clamp;
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- return NVPTXISD::Suld1DV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- return NVPTXISD::Suld1DV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- return NVPTXISD::Suld1DV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- return NVPTXISD::Suld1DV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- return NVPTXISD::Suld1DV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- return NVPTXISD::Suld1DV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- return NVPTXISD::Suld1DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- return NVPTXISD::Suld1DArrayI8Clamp;
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- return NVPTXISD::Suld1DArrayI16Clamp;
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- return NVPTXISD::Suld1DArrayI32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- return NVPTXISD::Suld1DArrayI64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- return NVPTXISD::Suld1DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- return NVPTXISD::Suld1DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- return NVPTXISD::Suld1DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- return NVPTXISD::Suld1DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- return NVPTXISD::Suld1DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- return NVPTXISD::Suld1DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- return NVPTXISD::Suld1DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- return NVPTXISD::Suld2DI8Clamp;
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- return NVPTXISD::Suld2DI16Clamp;
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- return NVPTXISD::Suld2DI32Clamp;
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- return NVPTXISD::Suld2DI64Clamp;
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- return NVPTXISD::Suld2DV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- return NVPTXISD::Suld2DV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- return NVPTXISD::Suld2DV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- return NVPTXISD::Suld2DV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- return NVPTXISD::Suld2DV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- return NVPTXISD::Suld2DV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- return NVPTXISD::Suld2DV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- return NVPTXISD::Suld2DArrayI8Clamp;
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- return NVPTXISD::Suld2DArrayI16Clamp;
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- return NVPTXISD::Suld2DArrayI32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- return NVPTXISD::Suld2DArrayI64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- return NVPTXISD::Suld2DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- return NVPTXISD::Suld2DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- return NVPTXISD::Suld2DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- return NVPTXISD::Suld2DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- return NVPTXISD::Suld2DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- return NVPTXISD::Suld2DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- return NVPTXISD::Suld2DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- return NVPTXISD::Suld3DI8Clamp;
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- return NVPTXISD::Suld3DI16Clamp;
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- return NVPTXISD::Suld3DI32Clamp;
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- return NVPTXISD::Suld3DI64Clamp;
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- return NVPTXISD::Suld3DV2I8Clamp;
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- return NVPTXISD::Suld3DV2I16Clamp;
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- return NVPTXISD::Suld3DV2I32Clamp;
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- return NVPTXISD::Suld3DV2I64Clamp;
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- return NVPTXISD::Suld3DV4I8Clamp;
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- return NVPTXISD::Suld3DV4I16Clamp;
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- return NVPTXISD::Suld3DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_i8_trap:
- return NVPTXISD::Suld1DI8Trap;
- case Intrinsic::nvvm_suld_1d_i16_trap:
- return NVPTXISD::Suld1DI16Trap;
- case Intrinsic::nvvm_suld_1d_i32_trap:
- return NVPTXISD::Suld1DI32Trap;
- case Intrinsic::nvvm_suld_1d_i64_trap:
- return NVPTXISD::Suld1DI64Trap;
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- return NVPTXISD::Suld1DV2I8Trap;
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- return NVPTXISD::Suld1DV2I16Trap;
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- return NVPTXISD::Suld1DV2I32Trap;
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- return NVPTXISD::Suld1DV2I64Trap;
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- return NVPTXISD::Suld1DV4I8Trap;
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- return NVPTXISD::Suld1DV4I16Trap;
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- return NVPTXISD::Suld1DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- return NVPTXISD::Suld1DArrayI8Trap;
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- return NVPTXISD::Suld1DArrayI16Trap;
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- return NVPTXISD::Suld1DArrayI32Trap;
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- return NVPTXISD::Suld1DArrayI64Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- return NVPTXISD::Suld1DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- return NVPTXISD::Suld1DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- return NVPTXISD::Suld1DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- return NVPTXISD::Suld1DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- return NVPTXISD::Suld1DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- return NVPTXISD::Suld1DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- return NVPTXISD::Suld1DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_2d_i8_trap:
- return NVPTXISD::Suld2DI8Trap;
- case Intrinsic::nvvm_suld_2d_i16_trap:
- return NVPTXISD::Suld2DI16Trap;
- case Intrinsic::nvvm_suld_2d_i32_trap:
- return NVPTXISD::Suld2DI32Trap;
- case Intrinsic::nvvm_suld_2d_i64_trap:
- return NVPTXISD::Suld2DI64Trap;
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- return NVPTXISD::Suld2DV2I8Trap;
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- return NVPTXISD::Suld2DV2I16Trap;
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- return NVPTXISD::Suld2DV2I32Trap;
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- return NVPTXISD::Suld2DV2I64Trap;
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- return NVPTXISD::Suld2DV4I8Trap;
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- return NVPTXISD::Suld2DV4I16Trap;
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- return NVPTXISD::Suld2DV4I32Trap;
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- return NVPTXISD::Suld2DArrayI8Trap;
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- return NVPTXISD::Suld2DArrayI16Trap;
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- return NVPTXISD::Suld2DArrayI32Trap;
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- return NVPTXISD::Suld2DArrayI64Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- return NVPTXISD::Suld2DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- return NVPTXISD::Suld2DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- return NVPTXISD::Suld2DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- return NVPTXISD::Suld2DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- return NVPTXISD::Suld2DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- return NVPTXISD::Suld2DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- return NVPTXISD::Suld2DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_3d_i8_trap:
- return NVPTXISD::Suld3DI8Trap;
- case Intrinsic::nvvm_suld_3d_i16_trap:
- return NVPTXISD::Suld3DI16Trap;
- case Intrinsic::nvvm_suld_3d_i32_trap:
- return NVPTXISD::Suld3DI32Trap;
- case Intrinsic::nvvm_suld_3d_i64_trap:
- return NVPTXISD::Suld3DI64Trap;
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- return NVPTXISD::Suld3DV2I8Trap;
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- return NVPTXISD::Suld3DV2I16Trap;
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- return NVPTXISD::Suld3DV2I32Trap;
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- return NVPTXISD::Suld3DV2I64Trap;
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- return NVPTXISD::Suld3DV4I8Trap;
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- return NVPTXISD::Suld3DV4I16Trap;
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- return NVPTXISD::Suld3DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_i8_zero:
- return NVPTXISD::Suld1DI8Zero;
- case Intrinsic::nvvm_suld_1d_i16_zero:
- return NVPTXISD::Suld1DI16Zero;
- case Intrinsic::nvvm_suld_1d_i32_zero:
- return NVPTXISD::Suld1DI32Zero;
- case Intrinsic::nvvm_suld_1d_i64_zero:
- return NVPTXISD::Suld1DI64Zero;
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- return NVPTXISD::Suld1DV2I8Zero;
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- return NVPTXISD::Suld1DV2I16Zero;
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- return NVPTXISD::Suld1DV2I32Zero;
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- return NVPTXISD::Suld1DV2I64Zero;
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- return NVPTXISD::Suld1DV4I8Zero;
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- return NVPTXISD::Suld1DV4I16Zero;
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- return NVPTXISD::Suld1DV4I32Zero;
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- return NVPTXISD::Suld1DArrayI8Zero;
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- return NVPTXISD::Suld1DArrayI16Zero;
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- return NVPTXISD::Suld1DArrayI32Zero;
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- return NVPTXISD::Suld1DArrayI64Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- return NVPTXISD::Suld1DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- return NVPTXISD::Suld1DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- return NVPTXISD::Suld1DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- return NVPTXISD::Suld1DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- return NVPTXISD::Suld1DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- return NVPTXISD::Suld1DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- return NVPTXISD::Suld1DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_2d_i8_zero:
- return NVPTXISD::Suld2DI8Zero;
- case Intrinsic::nvvm_suld_2d_i16_zero:
- return NVPTXISD::Suld2DI16Zero;
- case Intrinsic::nvvm_suld_2d_i32_zero:
- return NVPTXISD::Suld2DI32Zero;
- case Intrinsic::nvvm_suld_2d_i64_zero:
- return NVPTXISD::Suld2DI64Zero;
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- return NVPTXISD::Suld2DV2I8Zero;
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- return NVPTXISD::Suld2DV2I16Zero;
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- return NVPTXISD::Suld2DV2I32Zero;
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- return NVPTXISD::Suld2DV2I64Zero;
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- return NVPTXISD::Suld2DV4I8Zero;
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- return NVPTXISD::Suld2DV4I16Zero;
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- return NVPTXISD::Suld2DV4I32Zero;
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- return NVPTXISD::Suld2DArrayI8Zero;
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- return NVPTXISD::Suld2DArrayI16Zero;
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- return NVPTXISD::Suld2DArrayI32Zero;
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- return NVPTXISD::Suld2DArrayI64Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- return NVPTXISD::Suld2DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- return NVPTXISD::Suld2DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- return NVPTXISD::Suld2DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- return NVPTXISD::Suld2DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- return NVPTXISD::Suld2DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- return NVPTXISD::Suld2DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- return NVPTXISD::Suld2DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_3d_i8_zero:
- return NVPTXISD::Suld3DI8Zero;
- case Intrinsic::nvvm_suld_3d_i16_zero:
- return NVPTXISD::Suld3DI16Zero;
- case Intrinsic::nvvm_suld_3d_i32_zero:
- return NVPTXISD::Suld3DI32Zero;
- case Intrinsic::nvvm_suld_3d_i64_zero:
- return NVPTXISD::Suld3DI64Zero;
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- return NVPTXISD::Suld3DV2I8Zero;
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- return NVPTXISD::Suld3DV2I16Zero;
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- return NVPTXISD::Suld3DV2I32Zero;
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- return NVPTXISD::Suld3DV2I64Zero;
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- return NVPTXISD::Suld3DV4I8Zero;
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- return NVPTXISD::Suld3DV4I16Zero;
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- return NVPTXISD::Suld3DV4I32Zero;
- }
- }
- // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
- // TgtMemIntrinsic
- // because we need the information that is only available in the "Value" type
- // of destination
- // pointer. In particular, the address space information.
- bool NVPTXTargetLowering::getTgtMemIntrinsic(
- IntrinsicInfo &Info, const CallInst &I,
- MachineFunction &MF, unsigned Intrinsic) const {
- switch (Intrinsic) {
- default:
- return false;
- case Intrinsic::nvvm_match_all_sync_i32p:
- case Intrinsic::nvvm_match_all_sync_i64p:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
- // in order to model data exchange with other threads, but perform no real
- // memory accesses.
- Info.memVT = MVT::i1;
- // Our result depends on both our and other thread's arguments.
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v8f16;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v2i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(8);
- return true;
- }
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
- case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
- case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
- case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
- case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v4i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
- case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
- case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
- case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
- case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
- case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
- case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(4);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v4f16;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
- case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
- case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v8f32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v8i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
- case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
- case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
- case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v2i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(8);
- return true;
- }
- case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
- case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
- case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
- case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
- case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::f64;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(8);
- return true;
- }
- case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
- case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
- case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::v2f64;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::v4f16;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOStore;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
- case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
- case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::v8f32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOStore;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::v8i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOStore;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
- case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::v2i32;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOStore;
- Info.align = Align(8);
- return true;
- }
- case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
- case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
- case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::v2f64;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOStore;
- Info.align = Align(16);
- return true;
- }
- case Intrinsic::nvvm_atomic_load_inc_32:
- case Intrinsic::nvvm_atomic_load_dec_32:
- case Intrinsic::nvvm_atomic_add_gen_f_cta:
- case Intrinsic::nvvm_atomic_add_gen_f_sys:
- case Intrinsic::nvvm_atomic_add_gen_i_cta:
- case Intrinsic::nvvm_atomic_add_gen_i_sys:
- case Intrinsic::nvvm_atomic_and_gen_i_cta:
- case Intrinsic::nvvm_atomic_and_gen_i_sys:
- case Intrinsic::nvvm_atomic_cas_gen_i_cta:
- case Intrinsic::nvvm_atomic_cas_gen_i_sys:
- case Intrinsic::nvvm_atomic_dec_gen_i_cta:
- case Intrinsic::nvvm_atomic_dec_gen_i_sys:
- case Intrinsic::nvvm_atomic_inc_gen_i_cta:
- case Intrinsic::nvvm_atomic_inc_gen_i_sys:
- case Intrinsic::nvvm_atomic_max_gen_i_cta:
- case Intrinsic::nvvm_atomic_max_gen_i_sys:
- case Intrinsic::nvvm_atomic_min_gen_i_cta:
- case Intrinsic::nvvm_atomic_min_gen_i_sys:
- case Intrinsic::nvvm_atomic_or_gen_i_cta:
- case Intrinsic::nvvm_atomic_or_gen_i_sys:
- case Intrinsic::nvvm_atomic_exch_gen_i_cta:
- case Intrinsic::nvvm_atomic_exch_gen_i_sys:
- case Intrinsic::nvvm_atomic_xor_gen_i_cta:
- case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- Info.align.reset();
- return true;
- }
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
- return true;
- }
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
- return true;
- }
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4f32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_i8_trap:
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_i8_trap:
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_3d_i8_trap:
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_i8_zero:
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_i8_zero:
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_3d_i8_zero:
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i8;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_i16_trap:
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_i16_trap:
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_3d_i16_trap:
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_i16_zero:
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_i16_zero:
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_3d_i16_zero:
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i16;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_i32_trap:
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_i32_trap:
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_3d_i32_trap:
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_i32_zero:
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_i32_zero:
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_3d_i32_zero:
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_i64_trap:
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_i64_trap:
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_3d_i64_trap:
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_i64_zero:
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_i64_zero:
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_3d_i64_zero:
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i64;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.flags = MachineMemOperand::MOLoad;
- Info.align = Align(16);
- return true;
- }
- return false;
- }
- /// isLegalAddressingMode - Return true if the addressing mode represented
- /// by AM is legal for this target, for a load/store of the specified type.
- /// Used to guide target specific optimizations, like loop strength reduction
- /// (LoopStrengthReduce.cpp) and memory optimization for address mode
- /// (CodeGenPrepare.cpp)
- bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS, Instruction *I) const {
- // AddrMode - This represents an addressing mode of:
- // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
- //
- // The legal address modes are
- // - [avar]
- // - [areg]
- // - [areg+immoff]
- // - [immAddr]
- if (AM.BaseGV) {
- return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
- }
- switch (AM.Scale) {
- case 0: // "r", "r+i" or "i" is allowed
- break;
- case 1:
- if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
- return false;
- // Otherwise we have r+i.
- break;
- default:
- // No scale > 1 is allowed
- return false;
- }
- return true;
- }
- //===----------------------------------------------------------------------===//
- // NVPTX Inline Assembly Support
- //===----------------------------------------------------------------------===//
- /// getConstraintType - Given a constraint letter, return the type of
- /// constraint it is for this target.
- NVPTXTargetLowering::ConstraintType
- NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- default:
- break;
- case 'b':
- case 'r':
- case 'h':
- case 'c':
- case 'l':
- case 'f':
- case 'd':
- case '0':
- case 'N':
- return C_RegisterClass;
- }
- }
- return TargetLowering::getConstraintType(Constraint);
- }
- std::pair<unsigned, const TargetRegisterClass *>
- NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint,
- MVT VT) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- case 'b':
- return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
- case 'c':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'h':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'r':
- return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
- case 'l':
- case 'N':
- return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
- case 'f':
- return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
- case 'd':
- return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
- }
- }
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
- }
- //===----------------------------------------------------------------------===//
- // NVPTX DAG Combining
- //===----------------------------------------------------------------------===//
- bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
- CodeGenOpt::Level OptLevel) const {
- // Always honor command-line argument
- if (FMAContractLevelOpt.getNumOccurrences() > 0)
- return FMAContractLevelOpt > 0;
- // Do not contract if we're not optimizing the code.
- if (OptLevel == 0)
- return false;
- // Honor TargetOptions flags that explicitly say fusion is okay.
- if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
- return true;
- return allowUnsafeFPMath(MF);
- }
- bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
- // Honor TargetOptions flags that explicitly say unsafe math is okay.
- if (MF.getTarget().Options.UnsafeFPMath)
- return true;
- // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
- const Function &F = MF.getFunction();
- return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
- }
- /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
- /// operands N0 and N1. This is a helper for PerformADDCombine that is
- /// called with the default operands, and if that fails, with commuted
- /// operands.
- static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SelectionDAG &DAG = DCI.DAG;
- // Skip non-integer, non-scalar case
- EVT VT=N0.getValueType();
- if (VT.isVector())
- return SDValue();
- // fold (add (mul a, b), c) -> (mad a, b, c)
- //
- if (N0.getOpcode() == ISD::MUL) {
- assert (VT.isInteger());
- // For integer:
- // Since integer multiply-add costs the same as integer multiply
- // but is more costly than integer add, do the fusion only when
- // the mul is only used in the add.
- if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
- !N0.getNode()->hasOneUse())
- return SDValue();
- // Do the folding
- return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- else if (N0.getOpcode() == ISD::FMUL) {
- if (VT == MVT::f32 || VT == MVT::f64) {
- const auto *TLI = static_cast<const NVPTXTargetLowering *>(
- &DAG.getTargetLoweringInfo());
- if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
- return SDValue();
- // For floating point:
- // Do the fusion only when the mul has less than 5 uses and all
- // are add.
- // The heuristic is that if a use is not an add, then that use
- // cannot be fused into fma, therefore mul is still needed anyway.
- // If there are more than 4 uses, even if they are all add, fusing
- // them will increase register pressue.
- //
- int numUses = 0;
- int nonAddCount = 0;
- for (const SDNode *User : N0.getNode()->uses()) {
- numUses++;
- if (User->getOpcode() != ISD::FADD)
- ++nonAddCount;
- }
- if (numUses >= 5)
- return SDValue();
- if (nonAddCount) {
- int orderNo = N->getIROrder();
- int orderNo2 = N0.getNode()->getIROrder();
- // simple heuristics here for considering potential register
- // pressure, the logics here is that the differnce are used
- // to measure the distance between def and use, the longer distance
- // more likely cause register pressure.
- if (orderNo - orderNo2 < 500)
- return SDValue();
- // Now, check if at least one of the FMUL's operands is live beyond the node N,
- // which guarantees that the FMA will not increase register pressure at node N.
- bool opIsLive = false;
- const SDNode *left = N0.getOperand(0).getNode();
- const SDNode *right = N0.getOperand(1).getNode();
- if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
- opIsLive = true;
- if (!opIsLive)
- for (const SDNode *User : left->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
- if (!opIsLive)
- for (const SDNode *User : right->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
- if (!opIsLive)
- return SDValue();
- }
- return DAG.getNode(ISD::FMA, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- }
- return SDValue();
- }
- /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
- ///
- static SDValue PerformADDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- // First try with the default operand order.
- if (SDValue Result =
- PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
- return Result;
- // If that didn't work, try again with the operands commuted.
- return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
- }
- static SDValue PerformANDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // The type legalizer turns a vector load of i8 values into a zextload to i16
- // registers, optionally ANY_EXTENDs it (if target type is integer),
- // and ANDs off the high 8 bits. Since we turn this load into a
- // target-specific DAG node, the DAG combiner fails to eliminate these AND
- // nodes. Do that here.
- SDValue Val = N->getOperand(0);
- SDValue Mask = N->getOperand(1);
- if (isa<ConstantSDNode>(Val)) {
- std::swap(Val, Mask);
- }
- SDValue AExt;
- // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
- if (Val.getOpcode() == ISD::ANY_EXTEND) {
- AExt = Val;
- Val = Val->getOperand(0);
- }
- if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
- Val = Val->getOperand(0);
- }
- if (Val->getOpcode() == NVPTXISD::LoadV2 ||
- Val->getOpcode() == NVPTXISD::LoadV4) {
- ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
- if (!MaskCnst) {
- // Not an AND with a constant
- return SDValue();
- }
- uint64_t MaskVal = MaskCnst->getZExtValue();
- if (MaskVal != 0xff) {
- // Not an AND that chops off top 8 bits
- return SDValue();
- }
- MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
- if (!Mem) {
- // Not a MemSDNode?!?
- return SDValue();
- }
- EVT MemVT = Mem->getMemoryVT();
- if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
- // We only handle the i8 case
- return SDValue();
- }
- unsigned ExtType =
- cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
- getZExtValue();
- if (ExtType == ISD::SEXTLOAD) {
- // If for some reason the load is a sextload, the and is needed to zero
- // out the high 8 bits
- return SDValue();
- }
- bool AddTo = false;
- if (AExt.getNode() != nullptr) {
- // Re-insert the ext as a zext.
- Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
- AExt.getValueType(), Val);
- AddTo = true;
- }
- // If we get here, the AND is unnecessary. Just replace it with the load
- DCI.CombineTo(N, Val, AddTo);
- }
- return SDValue();
- }
- static SDValue PerformREMCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
- // Don't do anything at less than -O2.
- if (OptLevel < CodeGenOpt::Default)
- return SDValue();
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
- bool IsSigned = N->getOpcode() == ISD::SREM;
- unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
- const SDValue &Num = N->getOperand(0);
- const SDValue &Den = N->getOperand(1);
- for (const SDNode *U : Num->uses()) {
- if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
- U->getOperand(1) == Den) {
- // Num % Den -> Num - (Num / Den) * Den
- return DAG.getNode(ISD::SUB, DL, VT, Num,
- DAG.getNode(ISD::MUL, DL, VT,
- DAG.getNode(DivOpc, DL, VT, Num, Den),
- Den));
- }
- }
- return SDValue();
- }
- enum OperandSignedness {
- Signed = 0,
- Unsigned,
- Unknown
- };
- /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
- /// that can be demoted to \p OptSize bits without loss of information. The
- /// signedness of the operand, if determinable, is placed in \p S.
- static bool IsMulWideOperandDemotable(SDValue Op,
- unsigned OptSize,
- OperandSignedness &S) {
- S = Unknown;
- if (Op.getOpcode() == ISD::SIGN_EXTEND ||
- Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getFixedSizeInBits() <= OptSize) {
- S = Signed;
- return true;
- }
- } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getFixedSizeInBits() <= OptSize) {
- S = Unsigned;
- return true;
- }
- }
- return false;
- }
- /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
- /// be demoted to \p OptSize bits without loss of information. If the operands
- /// contain a constant, it should appear as the RHS operand. The signedness of
- /// the operands is placed in \p IsSigned.
- static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
- unsigned OptSize,
- bool &IsSigned) {
- OperandSignedness LHSSign;
- // The LHS operand must be a demotable op
- if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
- return false;
- // We should have been able to determine the signedness from the LHS
- if (LHSSign == Unknown)
- return false;
- IsSigned = (LHSSign == Signed);
- // The RHS can be a demotable op or a constant
- if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
- const APInt &Val = CI->getAPIntValue();
- if (LHSSign == Unsigned) {
- return Val.isIntN(OptSize);
- } else {
- return Val.isSignedIntN(OptSize);
- }
- } else {
- OperandSignedness RHSSign;
- if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
- return false;
- return LHSSign == RHSSign;
- }
- }
- /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
- /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
- /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
- /// amount.
- static SDValue TryMULWIDECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- EVT MulType = N->getValueType(0);
- if (MulType != MVT::i32 && MulType != MVT::i64) {
- return SDValue();
- }
- SDLoc DL(N);
- unsigned OptSize = MulType.getSizeInBits() >> 1;
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- // Canonicalize the multiply so the constant (if any) is on the right
- if (N->getOpcode() == ISD::MUL) {
- if (isa<ConstantSDNode>(LHS)) {
- std::swap(LHS, RHS);
- }
- }
- // If we have a SHL, determine the actual multiply amount
- if (N->getOpcode() == ISD::SHL) {
- ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
- if (!ShlRHS) {
- return SDValue();
- }
- APInt ShiftAmt = ShlRHS->getAPIntValue();
- unsigned BitWidth = MulType.getSizeInBits();
- if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
- APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
- RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
- } else {
- return SDValue();
- }
- }
- bool Signed;
- // Verify that our operands are demotable
- if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
- return SDValue();
- }
- EVT DemotedVT;
- if (MulType == MVT::i32) {
- DemotedVT = MVT::i16;
- } else {
- DemotedVT = MVT::i32;
- }
- // Truncate the operands to the correct size. Note that these are just for
- // type consistency and will (likely) be eliminated in later phases.
- SDValue TruncLHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
- SDValue TruncRHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
- unsigned Opc;
- if (Signed) {
- Opc = NVPTXISD::MUL_WIDE_SIGNED;
- } else {
- Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
- }
- return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
- }
- /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
- static SDValue PerformMULCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
- return SDValue();
- }
- /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
- static SDValue PerformSHLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
- return SDValue();
- }
- static SDValue PerformSETCCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- EVT CCType = N->getValueType(0);
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
- if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
- return SDValue();
- SDLoc DL(N);
- // setp.f16x2 returns two scalar predicates, which we need to
- // convert back to v2i1. The returned result will be scalarized by
- // the legalizer, but the comparison will remain a single vector
- // instruction.
- SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
- DCI.DAG.getVTList(MVT::i1, MVT::i1),
- {A, B, N->getOperand(2)});
- return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
- CCNode.getValue(1));
- }
- SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
- switch (N->getOpcode()) {
- default: break;
- case ISD::ADD:
- case ISD::FADD:
- return PerformADDCombine(N, DCI, STI, OptLevel);
- case ISD::MUL:
- return PerformMULCombine(N, DCI, OptLevel);
- case ISD::SHL:
- return PerformSHLCombine(N, DCI, OptLevel);
- case ISD::AND:
- return PerformANDCombine(N, DCI);
- case ISD::UREM:
- case ISD::SREM:
- return PerformREMCombine(N, DCI, OptLevel);
- case ISD::SETCC:
- return PerformSETCCCombine(N, DCI);
- }
- return SDValue();
- }
- /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
- static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- EVT ResVT = N->getValueType(0);
- SDLoc DL(N);
- assert(ResVT.isVector() && "Vector load must have vector type");
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 loads of <2 x double> here
- // but I'm leaving that as a TODO for now.
- assert(ResVT.isSimple() && "Can only handle simple types");
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- return;
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
- // This is a "native" vector type
- break;
- }
- LoadSDNode *LD = cast<LoadSDNode>(N);
- Align Alignment = LD->getAlign();
- auto &TD = DAG.getDataLayout();
- Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
- }
- EVT EltVT = ResVT.getVectorElementType();
- unsigned NumElts = ResVT.getVectorNumElements();
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
- unsigned Opcode = 0;
- SDVTList LdResVTs;
- bool LoadF16x2 = false;
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- case 8: {
- // v8f16 is a special case. PTX doesn't have ld.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // load them with ld.v4.b32.
- assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
- LoadF16x2 = true;
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
- MVT::Other};
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
- SmallVector<SDValue, 8> ScalarRes;
- if (LoadF16x2) {
- // Split v2f16 subvectors back into individual elements.
- NumElts /= 2;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue SubVector = NewLD.getValue(i);
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(0, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(1, DL));
- ScalarRes.push_back(E0);
- ScalarRes.push_back(E1);
- }
- } else {
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
- }
- SDValue LoadChain = NewLD.getValue(NumElts);
- SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
- }
- static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- SDValue Chain = N->getOperand(0);
- SDValue Intrin = N->getOperand(1);
- SDLoc DL(N);
- // Get the intrinsic ID
- unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- EVT ResVT = N->getValueType(0);
- if (ResVT.isVector()) {
- // Vector LDG/LDU
- unsigned NumElts = ResVT.getVectorNumElements();
- EVT EltVT = ResVT.getVectorElementType();
- // Since LDU/LDG are target nodes, we cannot rely on DAG type
- // legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
- unsigned Opcode = 0;
- SDVTList LdResVTs;
- switch (NumElts) {
- default:
- return;
- case 2:
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV2;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV2;
- break;
- }
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV4;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV4;
- break;
- }
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
- SmallVector<SDValue, 8> OtherOps;
- // Copy regular operands
- OtherOps.push_back(Chain); // Chain
- // Skip operand 1 (intrinsic ID)
- // Others
- OtherOps.append(N->op_begin() + 2, N->op_end());
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- MemSD->getMemoryVT(),
- MemSD->getMemOperand());
- SmallVector<SDValue, 4> ScalarRes;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res =
- DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
- SDValue LoadChain = NewLD.getValue(NumElts);
- SDValue BuildVec =
- DAG.getBuildVector(ResVT, DL, ScalarRes);
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
- } else {
- // i8 LDG/LDU
- assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
- "Custom handling of non-i8 ldu/ldg?");
- // Just copy all operands as-is
- SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
- // Force output to i16
- SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
- // We make sure the memory type is i8, which will be used during isel
- // to select the proper instruction.
- SDValue NewLD =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
- MVT::i8, MemSD->getMemOperand());
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- NewLD.getValue(0)));
- Results.push_back(NewLD.getValue(1));
- }
- }
- }
- }
- void NVPTXTargetLowering::ReplaceNodeResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- switch (N->getOpcode()) {
- default:
- report_fatal_error("Unhandled custom legalization");
- case ISD::LOAD:
- ReplaceLoadVector(N, DAG, Results);
- return;
- case ISD::INTRINSIC_W_CHAIN:
- ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
- return;
- }
- }
- // Pin NVPTXTargetObjectFile's vtables to this file.
- NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
- MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
- const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- return getDataSection();
- }
|