PPCGCodeGeneration.cpp

  1. //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Take a scop created by ScopInfo and map it to GPU code using the ppcg
  10. // GPU mapping strategy.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "polly/CodeGen/PPCGCodeGeneration.h"
  14. #include "polly/CodeGen/CodeGeneration.h"
  15. #include "polly/CodeGen/IslAst.h"
  16. #include "polly/CodeGen/IslNodeBuilder.h"
  17. #include "polly/CodeGen/PerfMonitor.h"
  18. #include "polly/CodeGen/Utils.h"
  19. #include "polly/DependenceInfo.h"
  20. #include "polly/LinkAllPasses.h"
  21. #include "polly/Options.h"
  22. #include "polly/ScopDetection.h"
  23. #include "polly/ScopInfo.h"
  24. #include "polly/Support/ISLTools.h"
  25. #include "polly/Support/SCEVValidator.h"
  26. #include "llvm/ADT/PostOrderIterator.h"
  27. #include "llvm/Analysis/TargetTransformInfo.h"
  28. #include "llvm/IR/IntrinsicsNVPTX.h"
  29. #include "llvm/IR/LegacyPassManager.h"
  30. #include "llvm/IR/Verifier.h"
  31. #include "llvm/IRReader/IRReader.h"
  32. #include "llvm/InitializePasses.h"
  33. #include "llvm/Linker/Linker.h"
  34. #include "llvm/MC/TargetRegistry.h"
  35. #include "llvm/Support/SourceMgr.h"
  36. #include "llvm/Target/TargetMachine.h"
  37. #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  38. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  39. #include "isl/union_map.h"
  40. #include <algorithm>
  41. extern "C" {
  42. #include "ppcg/cuda.h"
  43. #include "ppcg/gpu.h"
  44. #include "ppcg/ppcg.h"
  45. }
  46. #include "llvm/Support/Debug.h"
  47. using namespace polly;
  48. using namespace llvm;
  49. #define DEBUG_TYPE "polly-codegen-ppcg"
  50. static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
  51. cl::desc("Dump the computed GPU Schedule"),
  52. cl::Hidden, cl::init(false), cl::ZeroOrMore,
  53. cl::cat(PollyCategory));
  54. static cl::opt<bool>
  55. DumpCode("polly-acc-dump-code",
  56. cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
  57. cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
  58. static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
  59. cl::desc("Dump the kernel LLVM-IR"),
  60. cl::Hidden, cl::init(false), cl::ZeroOrMore,
  61. cl::cat(PollyCategory));
  62. static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
  63. cl::desc("Dump the kernel assembly code"),
  64. cl::Hidden, cl::init(false), cl::ZeroOrMore,
  65. cl::cat(PollyCategory));
  66. static cl::opt<bool> FastMath("polly-acc-fastmath",
  67. cl::desc("Allow unsafe math optimizations"),
  68. cl::Hidden, cl::init(false), cl::ZeroOrMore,
  69. cl::cat(PollyCategory));
  70. static cl::opt<bool> SharedMemory("polly-acc-use-shared",
  71. cl::desc("Use shared memory"), cl::Hidden,
  72. cl::init(false), cl::ZeroOrMore,
  73. cl::cat(PollyCategory));
  74. static cl::opt<bool> PrivateMemory("polly-acc-use-private",
  75. cl::desc("Use private memory"), cl::Hidden,
  76. cl::init(false), cl::ZeroOrMore,
  77. cl::cat(PollyCategory));
  78. bool polly::PollyManagedMemory;
  79. static cl::opt<bool, true>
  80. XManagedMemory("polly-acc-codegen-managed-memory",
  81. cl::desc("Generate Host kernel code assuming"
  82. " that all memory has been"
  83. " declared as managed memory"),
  84. cl::location(PollyManagedMemory), cl::Hidden,
  85. cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
  86. static cl::opt<bool>
  87. FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure",
  88. cl::desc("Fail and generate a backtrace if"
  89. " verifyModule fails on the GPU "
  90. " kernel module."),
  91. cl::Hidden, cl::init(false), cl::ZeroOrMore,
  92. cl::cat(PollyCategory));
  93. static cl::opt<std::string> CUDALibDevice(
  94. "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden,
  95. cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"),
  96. cl::ZeroOrMore, cl::cat(PollyCategory));
  97. static cl::opt<std::string>
  98. CudaVersion("polly-acc-cuda-version",
  99. cl::desc("The CUDA version to compile for"), cl::Hidden,
  100. cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
  101. static cl::opt<int>
  102. MinCompute("polly-acc-mincompute",
  103. cl::desc("Minimal number of compute statements to run on GPU."),
  104. cl::Hidden, cl::init(10 * 512 * 512));
  105. GPURuntime polly::GPURuntimeChoice;
  106. static cl::opt<GPURuntime, true> XGPURuntimeChoice(
  107. "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
  108. cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
  109. "use the CUDA Runtime API"),
  110. clEnumValN(GPURuntime::OpenCL, "libopencl",
  111. "use the OpenCL Runtime API")),
  112. cl::location(polly::GPURuntimeChoice), cl::init(GPURuntime::CUDA),
  113. cl::ZeroOrMore, cl::cat(PollyCategory));
  114. GPUArch polly::GPUArchChoice;
  115. static cl::opt<GPUArch, true>
  116. XGPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
  117. cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
  118. "target NVIDIA 64-bit architecture"),
  119. clEnumValN(GPUArch::SPIR32, "spir32",
  120. "target SPIR 32-bit architecture"),
  121. clEnumValN(GPUArch::SPIR64, "spir64",
  122. "target SPIR 64-bit architecture")),
  123. cl::location(polly::GPUArchChoice),
  124. cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
  125. cl::cat(PollyCategory));
  126. extern bool polly::PerfMonitoring;
  127. /// Return a unique name for a Scop, composed of the scop's region name and
  128. /// the name of the function it is located in.
  129. std::string getUniqueScopName(const Scop *S) {
  130. return "Scop Region: " + S->getNameStr() +
  131. " | Function: " + std::string(S->getFunction().getName());
  132. }
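// Illustrative example (names are hypothetical): a scop spanning the region
// %for.cond---%for.end in a function `foo` would be reported as
//   "Scop Region: %for.cond---%for.end | Function: foo"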
  133. /// Used to store information PPCG wants for kills. This information is
  134. /// used by live range reordering.
  135. ///
  136. /// @see computeLiveRangeReordering
  137. /// @see GPUNodeBuilder::createPPCGScop
  138. /// @see GPUNodeBuilder::createPPCGProg
  139. struct MustKillsInfo {
  140. /// Collection of all kill statements that will be sequenced at the end of
  141. /// PPCGScop->schedule.
  142. ///
  143. /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
  144. /// which merges schedules in *arbitrary* order.
  145. /// (we don't care about the order of the kills anyway).
  146. isl::schedule KillsSchedule;
  147. /// Map from kill statement instances to scalars that need to be
  148. /// killed.
  149. ///
  150. /// We currently derive kill information for:
  151. /// 1. phi nodes. PHI nodes are not alive outside the scop and can
  152. /// consequently all be killed.
  153. /// 2. Scalar arrays that are not used outside the Scop. This is
  154. /// checked by `isScalarUsesContainedInScop`.
  155. /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
  156. isl::union_map TaggedMustKills;
  157. /// Tagged must kills stripped of the tags.
  158. /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
  159. isl::union_map MustKills;
  160. MustKillsInfo() : KillsSchedule() {}
  161. };
  162. /// Check if SAI's uses are entirely contained within Scop S.
  163. /// If a scalar is used only within a Scop, we are free to kill it, as no data
  164. /// can flow in/out of the value any more.
  165. /// @see computeMustKillsInfo
  166. static bool isScalarUsesContainedInScop(const Scop &S,
  167. const ScopArrayInfo *SAI) {
  168. assert(SAI->isValueKind() && "this function only deals with scalars."
  169. " Dealing with arrays requires alias analysis");
  170. const Region &R = S.getRegion();
  171. for (User *U : SAI->getBasePtr()->users()) {
  172. Instruction *I = dyn_cast<Instruction>(U);
  173. assert(I && "invalid user of scop array info");
  174. if (!R.contains(I))
  175. return false;
  176. }
  177. return true;
  178. }
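// E.g., if the scalar's value is read again by an instruction located after
// the scop's region (a user outside R), this returns false and the scalar is
// consequently not killed.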
  179. /// Compute must-kills needed to enable live range reordering with PPCG.
  180. ///
  181. /// @param S The Scop to compute live range reordering information for.
  182. /// @returns Live range reordering information that can be used to set up
  183. /// PPCG.
  184. static MustKillsInfo computeMustKillsInfo(const Scop &S) {
  185. const isl::space ParamSpace = S.getParamSpace();
  186. MustKillsInfo Info;
  187. // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria:
  188. // 1.1 phi nodes in scop.
  189. // 1.2 scalars that are only used within the scop
  190. SmallVector<isl::id, 4> KillMemIds;
  191. for (ScopArrayInfo *SAI : S.arrays()) {
  192. if (SAI->isPHIKind() ||
  193. (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
  194. KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release()));
  195. }
  196. Info.TaggedMustKills = isl::union_map::empty(ParamSpace.ctx());
  197. Info.MustKills = isl::union_map::empty(ParamSpace.ctx());
  198. // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
  199. // schedule:
  200. // - filter: "[control] -> { }"
  201. // So, we choose to not create this to keep the output a little nicer,
  202. // at the cost of some code complexity.
  203. Info.KillsSchedule = {};
  204. for (isl::id &ToKillId : KillMemIds) {
  205. isl::id KillStmtId = isl::id::alloc(
  206. S.getIslCtx(),
  207. std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr);
  208. // NOTE: construction of tagged_must_kill:
  209. // 2. We need to construct a map:
  210. // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
  211. // To construct this, we use `isl_map_domain_product` on 2 maps`:
  212. // 2a. StmtToScalar:
  213. // [param] -> { Stmt_phantom[] -> scalar_to_kill[] }
  214. // 2b. PhantomRefToScalar:
  215. // [param] -> { ref_phantom[] -> scalar_to_kill[] }
  216. //
  217. // Combining these with `isl_map_domain_product` gives us
  218. // TaggedMustKill:
  219. // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
  220. // 2a. [param] -> { Stmt[] -> scalar_to_kill[] }
  221. isl::map StmtToScalar = isl::map::universe(ParamSpace);
  222. StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
  223. StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId));
  224. isl::id PhantomRefId = isl::id::alloc(
  225. S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(),
  226. nullptr);
  227. // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] }
  228. isl::map PhantomRefToScalar = isl::map::universe(ParamSpace);
  229. PhantomRefToScalar =
  230. PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId);
  231. PhantomRefToScalar =
  232. PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId);
  233. // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
  234. isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
  235. Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
  236. // 2. [param] -> { Stmt[] -> scalar_to_kill[] }
  237. Info.MustKills = Info.TaggedMustKills.domain_factor_domain();
  238. // 3. Create the kill schedule of the form:
  239. // "[param] -> { Stmt_phantom[] }"
  240. // Then add this to Info.KillsSchedule.
  241. isl::space KillStmtSpace = ParamSpace;
  242. KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
  243. isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
  244. isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
  245. if (!Info.KillsSchedule.is_null())
  246. Info.KillsSchedule = isl::manage(
  247. isl_schedule_set(Info.KillsSchedule.release(), KillSchedule.copy()));
  248. else
  249. Info.KillsSchedule = KillSchedule;
  250. }
  251. return Info;
  252. }
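// Illustrative example, assuming a killed scalar whose id is "MemRef_phi":
// the loop above creates a kill statement id "SKill_phantom_MemRef_phi" and a
// reference id "ref_phantomMemRef_phi", so the resulting maps roughly look
// like:
//   TaggedMustKills: [p] -> { [SKill_phantom_MemRef_phi[] ->
//                              ref_phantomMemRef_phi[]] -> MemRef_phi[] }
//   MustKills:       [p] -> { SKill_phantom_MemRef_phi[] -> MemRef_phi[] }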
  253. /// Create the ast expressions for a ScopStmt.
  254. ///
  255. /// This function is a callback used to generate the ast expressions for each
  256. /// of the scheduled ScopStmts.
  257. static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
  258. void *StmtT, __isl_take isl_ast_build *Build_C,
  259. isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
  260. isl_id *Id, void *User),
  261. void *UserIndex,
  262. isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
  263. void *UserExpr) {
  264. ScopStmt *Stmt = (ScopStmt *)StmtT;
  265. if (!Stmt || !Build_C)
  266. return NULL;
  267. isl::ast_build Build = isl::manage_copy(Build_C);
  268. isl::ctx Ctx = Build.ctx();
  269. isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0);
  270. Stmt->setAstBuild(Build);
  271. for (MemoryAccess *Acc : *Stmt) {
  272. isl::map AddrFunc = Acc->getAddressFunction();
  273. AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain());
  274. isl::id RefId = Acc->getId();
  275. isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc);
  276. isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA);
  277. MPA = MPA.coalesce();
  278. MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex));
  279. isl::ast_expr Access = Build.access_from(MPA);
  280. Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr));
  281. RefToExpr = RefToExpr.set(RefId, Access);
  282. }
  283. return RefToExpr.release();
  284. }
  285. /// Given an LLVM Type, compute its size in bytes.
  286. static int computeSizeInBytes(const Type *T) {
  287. int bytes = T->getPrimitiveSizeInBits() / 8;
  288. if (bytes == 0)
  289. bytes = T->getScalarSizeInBits() / 8;
  290. return bytes;
  291. }
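// For example, a 'double' has a primitive size of 64 bits and thus yields 8;
// the getScalarSizeInBits() fallback covers types whose primitive size is
// reported as 0 bits.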
  292. /// Generate code for a GPU specific isl AST.
  293. ///
  294. /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
  295. /// generates code for general-purpose AST nodes, with special functionality
  296. /// for generating GPU specific user nodes.
  297. ///
  298. /// @see GPUNodeBuilder::createUser
  299. class GPUNodeBuilder : public IslNodeBuilder {
  300. public:
  301. GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
  302. const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
  303. DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
  304. gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
  305. : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
  306. Prog(Prog), Runtime(Runtime), Arch(Arch) {
  307. getExprBuilder().setIDToSAI(&IDToSAI);
  308. }
  309. /// Create after-run-time-check initialization code.
  310. void initializeAfterRTH();
  311. /// Finalize the generated scop.
  312. void finalize() override;
  313. /// Track if the full build process was successful.
  314. ///
  315. /// This value is set to false, if throughout the build process an error
  316. /// occurred which prevents us from generating valid GPU code.
  317. bool BuildSuccessful = true;
  318. /// The maximal number of loops surrounding a sequential kernel.
  319. unsigned DeepestSequential = 0;
  320. /// The maximal number of loops surrounding a parallel kernel.
  321. unsigned DeepestParallel = 0;
  322. /// Return the name to set for the ptx_kernel.
  323. std::string getKernelFuncName(int Kernel_id);
  324. private:
  325. /// A vector of array base pointers for which a new ScopArrayInfo was created.
  326. ///
  327. /// This vector is used to delete the ScopArrayInfo when it is not needed any
  328. /// more.
  329. std::vector<Value *> LocalArrays;
  330. /// A map from ScopArrays to their corresponding device allocations.
  331. std::map<ScopArrayInfo *, Value *> DeviceAllocations;
  332. /// The current GPU context.
  333. Value *GPUContext;
  334. /// The set of isl_ids allocated in the kernel
  335. std::vector<isl_id *> KernelIds;
  336. /// A module containing GPU code.
  337. ///
  338. /// This pointer is only set in case we are currently generating GPU code.
  339. std::unique_ptr<Module> GPUModule;
  340. /// The GPU program we generate code for.
  341. gpu_prog *Prog;
  342. /// The GPU Runtime implementation to use (OpenCL or CUDA).
  343. GPURuntime Runtime;
  344. /// The GPU Architecture to target.
  345. GPUArch Arch;
  346. /// Class to free isl_ids.
  347. class IslIdDeleter {
  348. public:
  349. void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  350. };
  351. /// A set containing all isl_ids allocated in a GPU kernel.
  352. ///
  353. /// By releasing this set all isl_ids will be freed.
  354. std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
  355. IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
  356. /// Create code for user-defined AST nodes.
  357. ///
  358. /// These AST nodes can be of type:
  359. ///
  360. /// - ScopStmt: A computational statement (TODO)
  361. /// - Kernel: A GPU kernel call (TODO)
  362. /// - Data-Transfer: A GPU <-> CPU data-transfer
  363. /// - In-kernel synchronization
  364. /// - In-kernel memory copy statement
  365. ///
  366. /// @param UserStmt The ast node to generate code for.
  367. void createUser(__isl_take isl_ast_node *UserStmt) override;
  368. void createFor(__isl_take isl_ast_node *Node) override;
  369. enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
  370. /// Create code for a data transfer statement
  371. ///
  372. /// @param TransferStmt The data transfer statement.
  373. /// @param Direction The direction in which to transfer data.
  374. void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
  375. enum DataDirection Direction);
  376. /// Find llvm::Values referenced in GPU kernel.
  377. ///
  378. /// @param Kernel The kernel to scan for llvm::Values
  379. ///
  380. /// @returns A tuple, whose:
  381. /// - First element contains the set of values referenced by the
  382. /// kernel
  383. /// - Second element contains the set of functions referenced by the
  384. /// kernel. All functions in the set satisfy
  385. /// `isValidFunctionInKernel`.
  386. /// - Third element contains loops that have induction variables
  387. /// which are used in the kernel, *and* these loops are *neither*
  388. /// in the scop, nor do they immediately surround the Scop.
  389. /// See [Code generation of induction variables of loops outside
  390. /// Scops]
  391. std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
  392. isl::space>
  393. getReferencesInKernel(ppcg_kernel *Kernel);
  394. /// Compute the sizes of the execution grid for a given kernel.
  395. ///
  396. /// @param Kernel The kernel to compute grid sizes for.
  397. ///
  398. /// @returns A tuple with grid sizes for the X and Y dimensions.
  399. std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
  400. /// Get the managed array pointer for sending host pointers to the device.
  401. /// \note
  402. /// This is to be used only with managed memory
  403. Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
  404. /// Compute the sizes of the thread blocks for a given kernel.
  405. ///
  406. /// @param Kernel The kernel to compute thread block sizes for.
  407. ///
  408. /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
  409. std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
  410. /// Store a specific kernel launch parameter in the array of kernel launch
  411. /// parameters.
  412. ///
  413. /// @param Parameters The list of parameters in which to store.
  414. /// @param Param The kernel launch parameter to store.
  415. /// @param Index The index in the parameter list, at which to store the
  416. /// parameter.
  417. void insertStoreParameter(Instruction *Parameters, Instruction *Param,
  418. int Index);
  419. /// Create kernel launch parameters.
  420. ///
  421. /// @param Kernel The kernel to create parameters for.
  422. /// @param F The kernel function that has been created.
  423. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  424. ///
  425. /// @returns A stack allocated array with pointers to the parameter
  426. /// values that are passed to the kernel.
  427. Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
  428. SetVector<Value *> SubtreeValues);
  429. /// Create declarations for kernel variables.
  430. ///
  431. /// This includes shared memory declarations.
  432. ///
  433. /// @param Kernel The kernel definition to create variables for.
  434. /// @param FN The function into which to generate the variables.
  435. void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
  436. /// Add CUDA annotations to module.
  437. ///
  438. /// Add a set of CUDA annotations that declares the maximal block dimensions
  439. /// that will be used to execute the CUDA kernel. This allows the NVIDIA
  440. /// PTX compiler to bound the number of allocated registers, ensuring the
  441. /// resulting kernel is known to run with block dimensions up to the sizes
  442. /// specified here.
  443. ///
  444. /// @param M The module to add the annotations to.
  445. /// @param BlockDimX The size of block dimension X.
  446. /// @param BlockDimY The size of block dimension Y.
  447. /// @param BlockDimZ The size of block dimension Z.
  448. void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
  449. Value *BlockDimZ);
  450. /// Create GPU kernel.
  451. ///
  452. /// Code generate the kernel described by @p KernelStmt.
  453. ///
  454. /// @param KernelStmt The ast node to generate kernel code for.
  455. void createKernel(__isl_take isl_ast_node *KernelStmt);
  456. /// Generate code that computes the size of an array.
  457. ///
  458. /// @param Array The array for which to compute a size.
  459. Value *getArraySize(gpu_array_info *Array);
  460. /// Generate code to compute the minimal offset at which an array is accessed.
  461. ///
  462. /// The offset of an array is the minimal array location accessed in a scop.
  463. ///
  464. /// Example:
  465. ///
  466. /// for (long i = 0; i < 100; i++)
  467. /// A[i + 42] += ...
  468. ///
  469. /// getArrayOffset(A) results in 42.
  470. ///
  471. /// @param Array The array for which to compute the offset.
  472. /// @returns An llvm::Value that contains the offset of the array.
  473. Value *getArrayOffset(gpu_array_info *Array);
  474. /// Prepare the kernel arguments for kernel code generation
  475. ///
  476. /// @param Kernel The kernel to generate code for.
  477. /// @param FN The function created for the kernel.
  478. void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);
  479. /// Create kernel function.
  480. ///
  481. /// Create a kernel function located in a newly created module that can serve
  482. /// as target for device code generation. Set the Builder to point to the
  483. /// start block of this newly created function.
  484. ///
  485. /// @param Kernel The kernel to generate code for.
  486. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  487. /// @param SubtreeFunctions The set of llvm::Functions referenced by this
  488. /// kernel.
  489. void createKernelFunction(ppcg_kernel *Kernel,
  490. SetVector<Value *> &SubtreeValues,
  491. SetVector<Function *> &SubtreeFunctions);
  492. /// Create the declaration of a kernel function.
  493. ///
  494. /// The kernel function takes as arguments:
  495. ///
  496. /// - One i8 pointer for each external array reference used in the kernel.
  497. /// - Host iterators
  498. /// - Parameters
  499. /// - Other LLVM Value references (TODO)
  500. ///
  501. /// @param Kernel The kernel to generate the function declaration for.
  502. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  503. ///
  504. /// @returns The newly declared function.
  505. Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
  506. SetVector<Value *> &SubtreeValues);
  507. /// Insert intrinsic functions to obtain thread and block ids.
  508. ///
  509. /// @param Kernel The kernel to generate the intrinsic functions for.
  510. void insertKernelIntrinsics(ppcg_kernel *Kernel);
  511. /// Insert function calls to retrieve the SPIR group/local ids.
  512. ///
  513. /// @param Kernel The kernel to generate the function calls for.
  514. /// @param SizeTypeIs64bit Whether size_t of the OpenCL device is 64-bit.
  515. void insertKernelCallsSPIR(ppcg_kernel *Kernel, bool SizeTypeIs64bit);
  516. /// Setup the creation of functions referenced by the GPU kernel.
  517. ///
  518. /// 1. Create new function declarations in GPUModule which are the same as
  519. /// SubtreeFunctions.
  520. ///
  521. /// 2. Populate IslNodeBuilder::ValueMap with mappings from
  522. /// old functions (that come from the original module) to new functions
  523. /// (that are created within GPUModule). That way, we generate references
  524. /// to the correct function (in GPUModule) in BlockGenerator.
  525. ///
  526. /// @see IslNodeBuilder::ValueMap
  527. /// @see BlockGenerator::GlobalMap
  528. /// @see BlockGenerator::getNewValue
  529. /// @see GPUNodeBuilder::getReferencesInKernel.
  530. ///
  531. /// @param SubtreeFunctions The set of llvm::Functions referenced by
  532. /// this kernel.
  533. void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
  534. /// Create a global-to-shared or shared-to-global copy statement.
  535. ///
  536. /// @param CopyStmt The copy statement to generate code for
  537. void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
  538. /// Create code for a ScopStmt called in @p Expr.
  539. ///
  540. /// @param Expr The expression containing the call.
  541. /// @param KernelStmt The kernel statement referenced in the call.
  542. void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
  543. /// Create an in-kernel synchronization call.
  544. void createKernelSync();
  545. /// Create a PTX assembly string for the current GPU kernel.
  546. ///
  547. /// @returns A string containing the corresponding PTX assembly code.
  548. std::string createKernelASM();
  549. /// Remove references from the dominator tree to the kernel function @p F.
  550. ///
  551. /// @param F The function to remove references to.
  552. void clearDominators(Function *F);
  553. /// Remove references from scalar evolution to the kernel function @p F.
  554. ///
  555. /// @param F The function to remove references to.
  556. void clearScalarEvolution(Function *F);
  557. /// Remove references from loop info to the kernel function @p F.
  558. ///
  559. /// @param F The function to remove references to.
  560. void clearLoops(Function *F);
  561. /// Check if the scop requires to be linked with CUDA's libdevice.
  562. bool requiresCUDALibDevice();
  563. /// Link with the NVIDIA libdevice library (if needed and available).
  564. void addCUDALibDevice();
  565. /// Finalize the generation of the kernel function.
  566. ///
  567. /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  568. /// dump its IR to stderr.
  569. ///
  570. /// @returns The Assembly string of the kernel.
  571. std::string finalizeKernelFunction();
  572. /// Finalize the generation of the kernel arguments.
  573. ///
  574. /// This function ensures that not-read-only scalars used in a kernel are
  575. /// stored back to the global memory location they are backed with before
  576. /// the kernel terminates.
  577. ///
  578. /// @param Kernel The kernel to finalize kernel arguments for.
  579. void finalizeKernelArguments(ppcg_kernel *Kernel);
  580. /// Create code that allocates memory to store arrays on device.
  581. void allocateDeviceArrays();
  582. /// Create code to prepare the managed device pointers.
  583. void prepareManagedDeviceArrays();
  584. /// Free all allocated device arrays.
  585. void freeDeviceArrays();
  586. /// Create a call to initialize the GPU context.
  587. ///
  588. /// @returns A pointer to the newly initialized context.
  589. Value *createCallInitContext();
  590. /// Create a call to get the device pointer for a kernel allocation.
  591. ///
  592. /// @param Allocation The Polly GPU allocation
  593. ///
  594. /// @returns The device parameter corresponding to this allocation.
  595. Value *createCallGetDevicePtr(Value *Allocation);
  596. /// Create a call to free the GPU context.
  597. ///
  598. /// @param Context A pointer to an initialized GPU context.
  599. void createCallFreeContext(Value *Context);
  600. /// Create a call to allocate memory on the device.
  601. ///
  602. /// @param Size The size of memory to allocate
  603. ///
  604. /// @returns A pointer that identifies this allocation.
  605. Value *createCallAllocateMemoryForDevice(Value *Size);
  606. /// Create a call to free a device array.
  607. ///
  608. /// @param Array The device array to free.
  609. void createCallFreeDeviceMemory(Value *Array);
  610. /// Create a call to copy data from host to device.
  611. ///
  612. /// @param HostPtr A pointer to the host data that should be copied.
  613. /// @param DevicePtr A device pointer specifying the location to copy to.
  614. void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
  615. Value *Size);
  616. /// Create a call to copy data from device to host.
  617. ///
  618. /// @param DevicePtr A pointer to the device data that should be copied.
  619. /// @param HostPtr A host pointer specifying the location to copy to.
  620. void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
  621. Value *Size);
  622. /// Create a call to synchronize Host & Device.
  623. /// \note
  624. /// This is to be used only with managed memory.
  625. void createCallSynchronizeDevice();
  626. /// Create a call to get a kernel from an assembly string.
  627. ///
  628. /// @param Buffer The string describing the kernel.
  629. /// @param Entry The name of the kernel function to call.
  630. ///
  631. /// @returns A pointer to a kernel object
  632. Value *createCallGetKernel(Value *Buffer, Value *Entry);
  633. /// Create a call to free a GPU kernel.
  634. ///
  635. /// @param GPUKernel The kernel to free.
  636. void createCallFreeKernel(Value *GPUKernel);
  637. /// Create a call to launch a GPU kernel.
  638. ///
  639. /// @param GPUKernel The kernel to launch.
  640. /// @param GridDimX The size of the first grid dimension.
  641. /// @param GridDimY The size of the second grid dimension.
  642. /// @param BlockDimX The size of the first block dimension.
  643. /// @param BlockDimY The size of the second block dimension.
  644. /// @param BlockDimZ The size of the third block dimension.
  645. /// @param Parameters A pointer to an array that itself contains pointers to
  646. /// the parameter values passed for each kernel argument.
  647. void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
  648. Value *GridDimY, Value *BlockDimX,
  649. Value *BlockDimY, Value *BlockDimZ,
  650. Value *Parameters);
  651. };
  652. std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) {
  653. return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" +
  654. std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id);
  655. }
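// E.g., for a function `foo`, a scop with ID 1, and kernel ID 0 this yields
// the name "FUNC_foo_SCOP_1_KERNEL_0".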
  656. void GPUNodeBuilder::initializeAfterRTH() {
  657. BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
  658. &*Builder.GetInsertPoint(), &DT, &LI);
  659. NewBB->setName("polly.acc.initialize");
  660. Builder.SetInsertPoint(&NewBB->front());
  661. GPUContext = createCallInitContext();
  662. if (!PollyManagedMemory)
  663. allocateDeviceArrays();
  664. else
  665. prepareManagedDeviceArrays();
  666. }
  667. void GPUNodeBuilder::finalize() {
  668. if (!PollyManagedMemory)
  669. freeDeviceArrays();
  670. createCallFreeContext(GPUContext);
  671. IslNodeBuilder::finalize();
  672. }
  673. void GPUNodeBuilder::allocateDeviceArrays() {
  674. assert(!PollyManagedMemory &&
  675. "Managed memory will directly send host pointers "
  676. "to the kernel. There is no need for device arrays");
  677. isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release());
  678. for (int i = 0; i < Prog->n_array; ++i) {
  679. gpu_array_info *Array = &Prog->array[i];
  680. auto *ScopArray = (ScopArrayInfo *)Array->user;
  681. std::string DevArrayName("p_dev_array_");
  682. DevArrayName.append(Array->name);
  683. Value *ArraySize = getArraySize(Array);
  684. Value *Offset = getArrayOffset(Array);
  685. if (Offset)
  686. ArraySize = Builder.CreateSub(
  687. ArraySize,
  688. Builder.CreateMul(Offset,
  689. Builder.getInt64(ScopArray->getElemSizeInBytes())));
  690. const SCEV *SizeSCEV = SE.getSCEV(ArraySize);
  691. // It makes no sense to have an array of size 0. The CUDA API will
  692. // throw an error anyway if we invoke `cuMallocManaged` with size `0`. We
  693. // choose to be defensive and catch this at compile time. It is
  694. // most likely that we are doing something wrong with size computation.
  695. if (SizeSCEV->isZero()) {
  696. errs() << getUniqueScopName(&S)
  697. << " has computed array size 0: " << *ArraySize
  698. << " | for array: " << *(ScopArray->getBasePtr())
  699. << ". This is illegal, exiting.\n";
  700. report_fatal_error("array size was computed to be 0");
  701. }
  702. Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
  703. DevArray->setName(DevArrayName);
  704. DeviceAllocations[ScopArray] = DevArray;
  705. }
  706. isl_ast_build_free(Build);
  707. }
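// Sketch of the host IR this emits per array (names are illustrative):
//   %size          = ... ; ArraySize, possibly reduced by the array offset
//   %p_dev_array_A = call i8* @polly_allocateMemoryForDevice(i64 %size)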
  708. void GPUNodeBuilder::prepareManagedDeviceArrays() {
  709. assert(PollyManagedMemory &&
  710. "Device array most only be prepared in managed-memory mode");
  711. for (int i = 0; i < Prog->n_array; ++i) {
  712. gpu_array_info *Array = &Prog->array[i];
  713. ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
  714. Value *HostPtr;
  715. if (gpu_array_is_scalar(Array))
  716. HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  717. else
  718. HostPtr = ScopArray->getBasePtr();
  719. HostPtr = getLatestValue(HostPtr);
  720. Value *Offset = getArrayOffset(Array);
  721. if (Offset) {
  722. HostPtr = Builder.CreatePointerCast(
  723. HostPtr, ScopArray->getElementType()->getPointerTo());
  724. HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset);
  725. }
  726. HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
  727. DeviceAllocations[ScopArray] = HostPtr;
  728. }
  729. }
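// Note: in managed-memory mode the "device allocation" recorded above is just
// the host pointer (offset-adjusted if needed) cast to i8*, since managed
// memory allows the kernel to dereference host pointers directly.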
  730. void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
  731. Value *BlockDimY, Value *BlockDimZ) {
  732. auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
  733. for (auto &F : *M) {
  734. if (F.getCallingConv() != CallingConv::PTX_Kernel)
  735. continue;
  736. Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
  737. Metadata *Elements[] = {
  738. ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"),
  739. ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
  740. ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
  741. ValueAsMetadata::get(V[2]),
  742. };
  743. MDNode *Node = MDNode::get(M->getContext(), Elements);
  744. AnnotationNode->addOperand(Node);
  745. }
  746. }
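// The metadata added above roughly renders as the following IR (illustrative;
// the i32 values are whatever block-dimension sizes were passed in):
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 32, !"maxntidy", i32 4,
//          !"maxntidz", i32 1}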
  747. void GPUNodeBuilder::freeDeviceArrays() {
  748. assert(!PollyManagedMemory && "Managed memory does not use device arrays");
  749. for (auto &Array : DeviceAllocations)
  750. createCallFreeDeviceMemory(Array.second);
  751. }
  752. Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  753. const char *Name = "polly_getKernel";
  754. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  755. Function *F = M->getFunction(Name);
  756. // If F is not available, declare it.
  757. if (!F) {
  758. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  759. std::vector<Type *> Args;
  760. Args.push_back(Builder.getInt8PtrTy());
  761. Args.push_back(Builder.getInt8PtrTy());
  762. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  763. F = Function::Create(Ty, Linkage, Name, M);
  764. }
  765. return Builder.CreateCall(F, {Buffer, Entry});
  766. }
  767. Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  768. const char *Name = "polly_getDevicePtr";
  769. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  770. Function *F = M->getFunction(Name);
  771. // If F is not available, declare it.
  772. if (!F) {
  773. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  774. std::vector<Type *> Args;
  775. Args.push_back(Builder.getInt8PtrTy());
  776. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  777. F = Function::Create(Ty, Linkage, Name, M);
  778. }
  779. return Builder.CreateCall(F, {Allocation});
  780. }
  781. void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
  782. Value *GridDimY, Value *BlockDimX,
  783. Value *BlockDimY, Value *BlockDimZ,
  784. Value *Parameters) {
  785. const char *Name = "polly_launchKernel";
  786. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  787. Function *F = M->getFunction(Name);
  788. // If F is not available, declare it.
  789. if (!F) {
  790. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  791. std::vector<Type *> Args;
  792. Args.push_back(Builder.getInt8PtrTy());
  793. Args.push_back(Builder.getInt32Ty());
  794. Args.push_back(Builder.getInt32Ty());
  795. Args.push_back(Builder.getInt32Ty());
  796. Args.push_back(Builder.getInt32Ty());
  797. Args.push_back(Builder.getInt32Ty());
  798. Args.push_back(Builder.getInt8PtrTy());
  799. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  800. F = Function::Create(Ty, Linkage, Name, M);
  801. }
  802. Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
  803. BlockDimZ, Parameters});
  804. }
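// Sketch of the emitted host call (argument names are illustrative):
//   call void @polly_launchKernel(i8* %kernel, i32 %grid_x, i32 %grid_y,
//                                 i32 %block_x, i32 %block_y, i32 %block_z,
//                                 i8* %params)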
  805. void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  806. const char *Name = "polly_freeKernel";
  807. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  808. Function *F = M->getFunction(Name);
  809. // If F is not available, declare it.
  810. if (!F) {
  811. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  812. std::vector<Type *> Args;
  813. Args.push_back(Builder.getInt8PtrTy());
  814. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  815. F = Function::Create(Ty, Linkage, Name, M);
  816. }
  817. Builder.CreateCall(F, {GPUKernel});
  818. }
  819. void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  820. assert(!PollyManagedMemory &&
  821. "Managed memory does not allocate or free memory "
  822. "for device");
  823. const char *Name = "polly_freeDeviceMemory";
  824. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  825. Function *F = M->getFunction(Name);
  826. // If F is not available, declare it.
  827. if (!F) {
  828. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  829. std::vector<Type *> Args;
  830. Args.push_back(Builder.getInt8PtrTy());
  831. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  832. F = Function::Create(Ty, Linkage, Name, M);
  833. }
  834. Builder.CreateCall(F, {Array});
  835. }
  836. Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  837. assert(!PollyManagedMemory &&
  838. "Managed memory does not allocate or free memory "
  839. "for device");
  840. const char *Name = "polly_allocateMemoryForDevice";
  841. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  842. Function *F = M->getFunction(Name);
  843. // If F is not available, declare it.
  844. if (!F) {
  845. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  846. std::vector<Type *> Args;
  847. Args.push_back(Builder.getInt64Ty());
  848. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  849. F = Function::Create(Ty, Linkage, Name, M);
  850. }
  851. return Builder.CreateCall(F, {Size});
  852. }
  853. void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
  854. Value *DeviceData,
  855. Value *Size) {
  856. assert(!PollyManagedMemory &&
  857. "Managed memory does not transfer memory between "
  858. "device and host");
  859. const char *Name = "polly_copyFromHostToDevice";
  860. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  861. Function *F = M->getFunction(Name);
  862. // If F is not available, declare it.
  863. if (!F) {
  864. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  865. std::vector<Type *> Args;
  866. Args.push_back(Builder.getInt8PtrTy());
  867. Args.push_back(Builder.getInt8PtrTy());
  868. Args.push_back(Builder.getInt64Ty());
  869. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  870. F = Function::Create(Ty, Linkage, Name, M);
  871. }
  872. Builder.CreateCall(F, {HostData, DeviceData, Size});
  873. }
  874. void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
  875. Value *HostData,
  876. Value *Size) {
  877. assert(!PollyManagedMemory &&
  878. "Managed memory does not transfer memory between "
  879. "device and host");
  880. const char *Name = "polly_copyFromDeviceToHost";
  881. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  882. Function *F = M->getFunction(Name);
  883. // If F is not available, declare it.
  884. if (!F) {
  885. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  886. std::vector<Type *> Args;
  887. Args.push_back(Builder.getInt8PtrTy());
  888. Args.push_back(Builder.getInt8PtrTy());
  889. Args.push_back(Builder.getInt64Ty());
  890. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  891. F = Function::Create(Ty, Linkage, Name, M);
  892. }
  893. Builder.CreateCall(F, {DeviceData, HostData, Size});
  894. }
  895. void GPUNodeBuilder::createCallSynchronizeDevice() {
  896. assert(PollyManagedMemory && "explicit synchronization is only necessary for "
  897. "managed memory");
  898. const char *Name = "polly_synchronizeDevice";
  899. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  900. Function *F = M->getFunction(Name);
  901. // If F is not available, declare it.
  902. if (!F) {
  903. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  904. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
  905. F = Function::Create(Ty, Linkage, Name, M);
  906. }
  907. Builder.CreateCall(F);
  908. }

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name;

  switch (Runtime) {
  case GPURuntime::CUDA:
    Name = "polly_initContextCUDA";
    break;
  case GPURuntime::OpenCL:
    Name = "polly_initContextCL";
    break;
  }

  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}
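
/// Compute the size of @p Array in bytes as an i64 value.
///
/// Starting from the size recorded in @p Array, non-scalar arrays
/// additionally multiply in the number of elements, computed as the product
/// of the bound expressions of all array dimensions.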
Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  isl::ast_build Build = isl::ast_build::from_context(S.getContext());
  Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (!gpu_array_is_scalar(Array)) {
    isl::multi_pw_aff ArrayBound = isl::manage_copy(Array->bound);

    isl::pw_aff OffsetDimZero = ArrayBound.at(0);
    isl::ast_expr Res = Build.expr_from(OffsetDimZero);

    for (unsigned int i = 1; i < Array->n_index; i++) {
      isl::pw_aff Bound_I = ArrayBound.at(i);
      isl::ast_expr Expr = Build.expr_from(Bound_I);
      Res = Res.mul(Expr);
    }

    Value *NumElements = ExprBuilder.create(Res.release());
    if (NumElements->getType() != ArraySize->getType())
      NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
    ArraySize = Builder.CreateMul(ArraySize, NumElements);
  }

  return ArraySize;
}
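
/// Compute the offset (in elements) at which the accessed extent of @p Array
/// starts.
///
/// Returns nullptr if the array is a scalar or if the accessed extent already
/// starts at the origin in every dimension.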
Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
  if (gpu_array_is_scalar(Array))
    return nullptr;

  isl::ast_build Build = isl::ast_build::from_context(S.getContext());

  isl::set Min = isl::manage_copy(Array->extent).lexmin();

  isl::set ZeroSet = isl::set::universe(Min.get_space());

  for (unsigned i : rangeIslSize(0, Min.tuple_dim()))
    ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0);

  if (Min.is_subset(ZeroSet)) {
    return nullptr;
  }

  isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0));

  for (unsigned i : rangeIslSize(0, Min.tuple_dim())) {
    if (i > 0) {
      isl::pw_aff Bound_I =
          isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1));
      isl::ast_expr BExpr = Build.expr_from(Bound_I);
      Result = Result.mul(BExpr);
    }
    isl::pw_aff DimMin = Min.dim_min(i);
    isl::ast_expr MExpr = Build.expr_from(DimMin);
    Result = Result.add(MExpr);
  }

  return ExprBuilder.create(Result.release());
}

Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
                                             ScopArrayInfo *ArrayInfo) {
  assert(PollyManagedMemory && "Only used when you wish to get a host "
                               "pointer for sending data to the kernel, "
                               "with managed memory");
  std::map<ScopArrayInfo *, Value *>::iterator it;
  it = DeviceAllocations.find(ArrayInfo);
  assert(it != DeviceAllocations.end() &&
         "Device array expected to be available");
  return it->second;
}
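
/// Emit a host/device copy for the array referenced by @p TransferStmt.
///
/// If only a subrange of the array is accessed, i.e. getArrayOffset returns a
/// non-null offset, the host pointer is advanced by that offset and the
/// copied size is reduced accordingly, so that only the accessed part of the
/// array is transferred.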
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  assert(!PollyManagedMemory && "Managed memory needs no data transfers");
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(Arg);
  auto Array = (gpu_array_info *)isl_id_get_user(Id);
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *Offset = getArrayOffset(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  Value *HostPtr;

  if (gpu_array_is_scalar(Array))
    HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  else
    HostPtr = ScopArray->getBasePtr();
  HostPtr = getLatestValue(HostPtr);

  if (Offset) {
    HostPtr = Builder.CreatePointerCast(
        HostPtr, ScopArray->getElementType()->getPointerTo());
    HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset);
  }

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  if (Offset) {
    Size = Builder.CreateSub(
        Size, Builder.CreateMul(
                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
  }

  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);

  isl_id_free(Id);
  isl_ast_expr_free(Arg);
  isl_ast_expr_free(Expr);
  isl_ast_node_free(TransferStmt);
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  const char *Str = isl_id_get_name(Id);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    if (PollyManagedMemory)
      createCallSynchronizeDevice();
    isl_ast_expr_free(Expr);
    return;
  }
  if (!strcmp(Str, "init_device")) {
    initializeAfterRTH();
    isl_ast_node_free(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }
  if (!strcmp(Str, "clear_device")) {
    finalize();
    isl_ast_node_free(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }
  if (isPrefix(Str, "to_device")) {
    if (!PollyManagedMemory)
      createDataTransfer(UserStmt, HOST_TO_DEVICE);
    else
      isl_ast_node_free(UserStmt);

    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    if (!PollyManagedMemory) {
      createDataTransfer(UserStmt, DEVICE_TO_HOST);
    } else {
      isl_ast_node_free(UserStmt);
    }
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
}

void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) {
  createForSequential(isl::manage(Node).as<isl::ast_node_for>(), false);
}

void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  LocalIndex = isl_ast_expr_address_of(LocalIndex);
  Value *LocalAddr = ExprBuilder.create(LocalIndex);
  isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  Index = isl_ast_expr_address_of(Index);
  Value *GlobalAddr = ExprBuilder.create(Index);
  Type *IndexTy = cast<PointerType>(GlobalAddr->getType())->getElementType();

  if (KernelStmt->u.c.read) {
    LoadInst *Load = Builder.CreateLoad(IndexTy, GlobalAddr, "shared.read");
    Builder.CreateStore(Load, LocalAddr);
  } else {
    LoadInst *Load = Builder.CreateLoad(IndexTy, LocalAddr, "shared.write");
    Builder.CreateStore(Load, GlobalAddr);
  }
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    RegionGen.copyStmt(*Stmt, LTS, Indexes);
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const char *SpirName = "__gen_ocl_barrier_global";

  Function *Sync;

  switch (Arch) {
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    Sync = M->getFunction(SpirName);

    // If Sync is not available, declare it.
    if (!Sync) {
      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
      std::vector<Type *> Args;

      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
      Sync = Function::Create(Ty, Linkage, SpirName, M);
      Sync->setCallingConv(CallingConv::SPIR_FUNC);
    }
    break;
  case GPUArch::NVPTX64:
    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
    break;
  }

  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);

  return isl_bool_true;
}

/// A list of functions that are available in NVIDIA's libdevice.
const std::set<std::string> CUDALibDeviceFunctions = {
    "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf",
    "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"};

// A map from intrinsics to their corresponding libdevice functions.
const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
    {"llvm.exp.f64", "exp"},
    {"llvm.exp.f32", "expf"},
    {"llvm.powi.f64.i32", "powi"},
    {"llvm.powi.f32.i32", "powif"}};

/// Return the corresponding CUDA libdevice function name for @p Name.
/// Note that this function will try to convert intrinsics in the list
/// IntrinsicToLibdeviceFunc into libdevice functions.
/// This is because some intrinsics such as `exp`
/// are not supported by the NVPTX backend.
/// If this restriction of the backend is lifted, we should refactor our code
/// so that we use intrinsics whenever possible.
///
/// Return "" if we are not compiling for CUDA.
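///
/// For example (illustrative): "llvm.exp.f64" is first mapped to "exp" and
/// then returned as "__nv_exp", whereas a name covered by neither table
/// yields the empty string.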
std::string getCUDALibDeviceFuntion(StringRef NameRef) {
  std::string Name = NameRef.str();
  auto It = IntrinsicToLibdeviceFunc.find(Name);
  if (It != IntrinsicToLibdeviceFunc.end())
    return getCUDALibDeviceFuntion(It->second);

  if (CUDALibDeviceFunctions.count(Name))
    return ("__nv_" + Name);

  return "";
}

/// Check if F is a function that we can code-generate in a GPU kernel.
static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
  assert(F && "F is an invalid pointer");
  // We string compare against the name of the function to allow
  // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
  // "llvm.copysign".
  const StringRef Name = F->getName();

  if (AllowLibDevice && getCUDALibDeviceFuntion(Name).length() > 0)
    return true;

  return F->isIntrinsic() &&
         (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
          Name.startswith("llvm.copysign"));
}

/// Do not take `Function` as a subtree value.
///
/// We try to take the reference of all subtree values and pass them along
/// to the kernel from the host. Taking an address of any function and
/// trying to pass along is nonsensical. Only allow `Value`s that are not
/// `Function`s.
static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }

/// Return `Function`s from `RawSubtreeValues`.
static SetVector<Function *>
getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
                                 bool AllowCUDALibDevice) {
  SetVector<Function *> SubtreeFunctions;
  for (Value *It : RawSubtreeValues) {
    Function *F = dyn_cast<Function>(It);
    if (F) {
      assert(isValidFunctionInKernel(F, AllowCUDALibDevice) &&
             "Code should have bailed out by "
             "this point if an invalid function "
             "were present in a kernel.");
      SubtreeFunctions.insert(F);
    }
  }
  return SubtreeFunctions;
}

std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
           isl::space>
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params();
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(),
      &ParamSpace};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  // NOTE: this is populated in IslNodeBuilder::addParameters
  // See [Code generation of induction variables of loops outside Scops].
  for (const auto &I : OutsideLoopIterations)
    SubtreeValues.insert(cast<SCEVUnknown>(I.second)->getValue());

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs) {
    findValues(Expr, SE, SubtreeValues);
    findLoops(Expr, Loops);
  }

  Loops.remove_if([this](const Loop *L) {
    return S.contains(L) || L->contains(S.getEntry());
  });

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  isl_space *Space = S.getParamSpace().release();
  for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0, n = isl_space_dim(Kernel->space, isl_dim_set); i < n; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
  // SubtreeValues. This is important, because we should not lose any
  // SubtreeValues in the process of constructing the
  // "ValidSubtree{Values, Functions} sets. Nor should the set
  // ValidSubtree{Values, Functions} have any common element.
  auto ValidSubtreeValuesIt =
      make_filter_range(SubtreeValues, isValidSubtreeValue);
  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                        ValidSubtreeValuesIt.end());

  bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64;

  SetVector<Function *> ValidSubtreeFunctions(
      getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice));

  // @see IslNodeBuilder::getReferencesInSubtree
  SetVector<Value *> ReplacedValues;
  for (Value *V : ValidSubtreeValues) {
    auto It = ValueMap.find(V);
    if (It == ValueMap.end())
      ReplacedValues.insert(V);
    else
      ReplacedValues.insert(It->second);
  }
  return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops,
                         ParamSpace);
}
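
/// Erase the dominator tree nodes of all basic blocks in @p F.
///
/// The kernel function is created while the host's DominatorTree is still
/// being updated (see createKernelFunction), so its blocks have to be dropped
/// again once kernel code generation has finished.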
void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);

    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  SmallSet<Loop *, 1> WorkList;
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      WorkList.insert(L);
  }
  for (auto *L : WorkList)
    LI.erase(L);
}
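
/// Compute the x and y dimensions of the launch grid for @p Kernel.
///
/// The values are built from Kernel->grid_size, truncated to i32, and missing
/// dimensions are padded with a constant 1.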
std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;
  isl::ast_build Context = isl::ast_build::from_context(S.getContext());

  isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size);
  for (long i = 0; i < Kernel->n_grid; i++) {
    isl::pw_aff Size = GridSizePwAffs.at(i);
    isl::ast_expr GridSize = Context.expr_from(Size);
    Value *Res = ExprBuilder.create(GridSize.release());
    Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_grid; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}

std::tuple<Value *, Value *, Value *>
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;

  for (long i = 0; i < Kernel->n_block; i++) {
    Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_block; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}

void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
                                          Instruction *Param, int Index) {
  Value *Slot = Builder.CreateGEP(
      Parameters->getType()->getPointerElementType(), Parameters,
      {Builder.getInt64(0), Builder.getInt64(Index)});
  Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  Builder.CreateStore(ParamTyped, Slot);
}
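
/// Create the array of launch parameters that is passed to the kernel.
///
/// Each kernel argument occupies one i8* slot: first the device arrays, then
/// the host iterators of the kernel space, then its parameters, and finally
/// the subtree values. When targeting the OpenCL runtime, one extra slot per
/// argument holding the argument's size in bytes is appended at the end.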
Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  const int NumArgs = F->arg_size();
  std::vector<int> ArgSizes(NumArgs);

  // If we are using the OpenCL Runtime, we need to add the kernel argument
  // sizes to the end of the launch-parameter list, so OpenCL can determine
  // how big the respective kernel arguments are.
  // Here we need to reserve adequate space for that.
  Type *ArrayTy;
  if (Runtime == GPURuntime::OpenCL)
    ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);
  else
    ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs);

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters = new AllocaInst(
      ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));

    if (Runtime == GPURuntime::OpenCL)
      ArgSizes[Index] = SAI->getElemSizeInBytes();

    Value *DevArray = nullptr;
    if (PollyManagedMemory) {
      DevArray = getManagedDeviceArray(&Prog->array[i],
                                       const_cast<ScopArrayInfo *>(SAI));
    } else {
      DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
      DevArray = createCallGetDevicePtr(DevArray);
    }
    assert(DevArray != nullptr && "Array to be offloaded to device not "
                                  "initialized");
    Value *Offset = getArrayOffset(&Prog->array[i]);

    if (Offset) {
      DevArray = Builder.CreatePointerCast(
          DevArray, SAI->getElementType()->getPointerTo());
      DevArray = Builder.CreateGEP(SAI->getElementType(), DevArray,
                                   Builder.CreateNeg(Offset));
      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
    }
    Value *Slot = Builder.CreateGEP(
        ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Value *ValPtr = nullptr;
      if (PollyManagedMemory)
        ValPtr = DevArray;
      else
        ValPtr = BlockGen.getOrCreateAlloca(SAI);

      assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
                                  " to be stored into Parameters");
      Value *ValPtrCast =
          Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
      Builder.CreateStore(ValPtrCast, Slot);
    } else {
      Instruction *Param =
          new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
                         Launch + "_param_" + std::to_string(Index),
                         EntryBlock->getTerminator());
      Builder.CreateStore(DevArray, Param);
      Value *ParamTyped =
          Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
      Builder.CreateStore(ParamTyped, Slot);
    }
    Index++;
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    if (Runtime == GPURuntime::OpenCL)
      ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    if (ValueMap.count(Val))
      Val = ValueMap[Val];
    isl_id_free(Id);

    if (Runtime == GPURuntime::OpenCL)
      ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  for (auto Val : SubtreeValues) {
    if (Runtime == GPURuntime::OpenCL)
      ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  if (Runtime == GPURuntime::OpenCL) {
    for (int i = 0; i < NumArgs; i++) {
      Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
      Instruction *Param =
          new AllocaInst(Builder.getInt32Ty(), AddressSpace,
                         Launch + "_param_size_" + std::to_string(i),
                         EntryBlock->getTerminator());
      Builder.CreateStore(Val, Param);
      insertStoreParameter(Parameters, Param, Index);
      Index++;
    }
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}

void GPUNodeBuilder::setupKernelSubtreeFunctions(
    SetVector<Function *> SubtreeFunctions) {
  for (auto Fn : SubtreeFunctions) {
    const std::string ClonedFnName = Fn->getName().str();
    Function *Clone = GPUModule->getFunction(ClonedFnName);
    if (!Clone)
      Clone =
          Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
                           ClonedFnName, GPUModule.get());
    assert(Clone && "Expected cloned function to be initialized.");
    assert(ValueMap.find(Fn) == ValueMap.end() &&
           "Fn already present in ValueMap");
    ValueMap[Fn] = Clone;
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  if (Kernel->n_grid > 1)
    DeepestParallel = std::max(
        DeepestParallel, (unsigned)isl_space_dim(Kernel->space, isl_dim_set));
  else
    DeepestSequential = std::max(
        DeepestSequential, (unsigned)isl_space_dim(Kernel->space, isl_dim_set));

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  SetVector<Value *> SubtreeValues;
  SetVector<Function *> SubtreeFunctions;
  SetVector<const Loop *> Loops;
  isl::space ParamSpace;
  std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) =
      getReferencesInKernel(Kernel);

  // Add parameters that appear only in the access function to the kernel
  // space. This is important to make sure that all isl_ids are passed as
  // parameters to the kernel, even though we may not have all parameters
  // in the context to improve compile time.
  Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release());

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;
  BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
  ScalarMap.clear();
  BlockGenerator::EscapeUsersAllocaMapTy HostEscapeMap = EscapeMap;
  EscapeMap.clear();

  // Create for all loops we depend on values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
  setupKernelSubtreeFunctions(SubtreeFunctions);

  create(isl_ast_node_copy(Kernel->tree));

  finalizeKernelArguments(Kernel);
  Function *F = Builder.GetInsertBlock()->getParent();
  if (Arch == GPUArch::NVPTX64)
    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  IDToValue = HostIDs;

  ValueMap = std::move(HostValueMap);
  ScalarMap = std::move(HostScalarMap);
  EscapeMap = std::move(HostEscapeMap);
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
  LocalArrays.clear();

  std::string ASMString = finalizeKernelFunction();
  Builder.SetInsertPoint(&HostInsertPoint);
  Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);

  std::string Name = getKernelFuncName(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);

  for (auto Id : KernelIds)
    isl_id_free(Id);

  KernelIds.clear();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "";

  if (!is64Bit) {
    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  } else {
    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  }

  return Ret;
}

/// Compute the DataLayout string for a SPIR kernel.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeSPIRDataLayout(bool is64Bit) {
  std::string Ret = "";

  if (!is64Bit) {
    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  } else {
    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  }

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = getKernelFuncName(Kernel->id);

  std::vector<Metadata *> MemoryType;

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
      const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
      Args.push_back(SAI->getElementType());
      MemoryType.push_back(
          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
    } else {
      static const int UseGlobalMemory = 1;
      Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
      MemoryType.push_back(
          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
    }
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    Args.push_back(Builder.getInt64Ty());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Args.push_back(Val->getType());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  for (auto *V : SubtreeValues) {
    Args.push_back(V->getType());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());

  std::vector<Metadata *> EmptyStrings;

  for (unsigned int i = 0; i < MemoryType.size(); i++) {
    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
  }

  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
    FN->setMetadata("kernel_arg_addr_space",
                    MDNode::get(FN->getContext(), MemoryType));
    FN->setMetadata("kernel_arg_name",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_access_qual",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_type",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_type_qual",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_base_type",
                    MDNode::get(FN->getContext(), EmptyStrings));
  }

  switch (Arch) {
  case GPUArch::NVPTX64:
    FN->setCallingConv(CallingConv::PTX_Kernel);
    break;
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    FN->setCallingConv(CallingConv::SPIR_KERNEL);
    break;
  }

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    Sizes.push_back(nullptr);
    for (long j = 1, n = Kernel->array[i].array->n_index; j < n; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build,
          isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    Value *Val = IDToValue[Id];
    ValueMap[Val] = &*Arg;
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}
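
/// Make the block and thread position of the current thread available inside
/// the kernel.
///
/// For NVPTX, the block ids of @p Kernel are bound to the
/// llvm.nvvm.read.ptx.sreg.ctaid.* intrinsics and the thread ids to the
/// llvm.nvvm.read.ptx.sreg.tid.* intrinsics; the results are zero-extended to
/// i64 before they are recorded in IDToValue.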
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[2];
  Intrinsic::ID IntrinsicsTID[3];

  switch (Arch) {
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
  case GPUArch::NVPTX64:
    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;

    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
    break;
  }

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}
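
/// SPIR counterpart of insertKernelIntrinsics: bind the block and thread ids
/// of @p Kernel to calls to the __gen_ocl_get_group_id* and
/// __gen_ocl_get_local_id* builtins, which are declared on demand with the
/// SPIR_FUNC calling convention and widened to i64 when size_t is 32 bit.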
void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel,
                                           bool SizeTypeIs64bit) {
  const char *GroupName[3] = {"__gen_ocl_get_group_id0",
                              "__gen_ocl_get_group_id1",
                              "__gen_ocl_get_group_id2"};

  const char *LocalName[3] = {"__gen_ocl_get_local_id0",
                              "__gen_ocl_get_local_id1",
                              "__gen_ocl_get_local_id2"};
  IntegerType *SizeT =
      SizeTypeIs64bit ? Builder.getInt64Ty() : Builder.getInt32Ty();

  auto createFunc = [this](const char *Name, __isl_take isl_id *Id,
                           IntegerType *SizeT) mutable {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *FN = M->getFunction(Name);

    // If FN is not available, declare it.
    if (!FN) {
      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
      std::vector<Type *> Args;
      FunctionType *Ty = FunctionType::get(SizeT, Args, false);
      FN = Function::Create(Ty, Linkage, Name, M);
      FN->setCallingConv(CallingConv::SPIR_FUNC);
    }

    Value *Val = Builder.CreateCall(FN, {});
    if (SizeT == Builder.getInt32Ty())
      Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i)
    createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i), SizeT);

  for (int i = 0; i < Kernel->n_block; ++i)
    createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i), SizeT);
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    Value *Val = &*Arg;

    if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Type *TypePtr = SAI->getElementType()->getPointerTo();
      Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
      Val = Builder.CreateLoad(SAI->getElementType(), TypedArgPtr);
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Builder.CreateStore(Val, Alloca);

    Arg++;
  }
}

void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *FN = Builder.GetInsertBlock()->getParent();
  auto Arg = FN->arg_begin();

  bool StoredScalar = false;
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(SAI->getElementType(), Alloca);
    Builder.CreateStore(Val, TypedArgPtr);
    StoredScalar = true;

    Arg++;
  }

  if (StoredScalar) {
    /// In case more than one thread contains scalar stores, the generated
    /// code might be incorrect, if we only store at the end of the kernel.
    /// To support this case we need to store these scalars back at each
    /// memory store or at least before each kernel barrier.
    if (Kernel->n_block != 0 || Kernel->n_grid != 0) {
      BuildSuccessful = 0;
      LLVM_DEBUG(
          dbgs() << getUniqueScopName(&S)
                 << " has a store to a scalar value that"
                    " would be undefined to run in parallel. Bailing out.\n";);
    }
  }
}

void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType();

    Type *ArrayTy = EleTy;
    SmallVector<const SCEV *, 4> Sizes;

    Sizes.push_back(nullptr);
    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
    }

    for (int j = Var.array->n_index - 1; j >= 0; --j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    const ScopArrayInfo *SAI;
    Value *Allocation;
    if (Var.type == ppcg_access_shared) {
      auto GlobalVar = new GlobalVariable(
          *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
          nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      GlobalVar->setAlignment(llvm::Align(EleTy->getPrimitiveSizeInBits() / 8));
      GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));

      Allocation = GlobalVar;
    } else if (Var.type == ppcg_access_private) {
      Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
    } else {
      llvm_unreachable("unknown variable type");
    }
    SAI =
        S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
    Id = isl_id_alloc(S.getIslCtx().get(), Var.name, nullptr);
    IDToValue[Id] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

void GPUNodeBuilder::createKernelFunction(
    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
    SetVector<Function *> &SubtreeFunctions) {
  std::string Identifier = getKernelFuncName(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));

  switch (Arch) {
  case GPUArch::NVPTX64:
    if (Runtime == GPURuntime::CUDA)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    else if (Runtime == GPURuntime::OpenCL)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
    break;
  case GPUArch::SPIR32:
    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
    break;
  case GPUArch::SPIR64:
    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
    break;
  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);

  switch (Arch) {
  case GPUArch::NVPTX64:
    insertKernelIntrinsics(Kernel);
    break;
  case GPUArch::SPIR32:
    insertKernelCallsSPIR(Kernel, false);
    break;
  case GPUArch::SPIR64:
    insertKernelCallsSPIR(Kernel, true);
    break;
  }
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple;

  switch (Arch) {
  case GPUArch::NVPTX64:
    switch (Runtime) {
    case GPURuntime::CUDA:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
      break;
    case GPURuntime::OpenCL:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
      break;
    }
    break;
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    std::string SPIRAssembly;
    raw_string_ostream IROstream(SPIRAssembly);
    IROstream << *GPUModule;
    IROstream.flush();
    return SPIRAssembly;
  }

  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;

  std::string subtarget;

  switch (Arch) {
  case GPUArch::NVPTX64:
    subtarget = CudaVersion;
    break;
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    llvm_unreachable("No subtarget for SPIR architecture");
  }

  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(PM, ASMStream, nullptr, CGFT_AssemblyFile,
                                   true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str().str();
}

bool GPUNodeBuilder::requiresCUDALibDevice() {
  bool RequiresLibDevice = false;
  for (Function &F : GPUModule->functions()) {
    if (!F.isDeclaration())
      continue;

    const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(F.getName());
    if (CUDALibDeviceFunc.length() != 0) {
      // We need to handle the case where a module looks like this:
      // @expf(..)
      // @llvm.exp.f32(..)
      // Both of these functions would be renamed to `__nv_expf`.
      //
      // So, we must first check for the existence of the libdevice function.
      // If this exists, we replace our current function with it.
      //
      // If it does not exist, we rename the current function to the
      // libdevice function name.
      if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
        F.replaceAllUsesWith(Replacement);
      else
        F.setName(CUDALibDeviceFunc);
      RequiresLibDevice = true;
    }
  }

  return RequiresLibDevice;
}

void GPUNodeBuilder::addCUDALibDevice() {
  if (Arch != GPUArch::NVPTX64)
    return;

  if (requiresCUDALibDevice()) {
    SMDiagnostic Error;

    errs() << CUDALibDevice << "\n";
    auto LibDeviceModule =
        parseIRFile(CUDALibDevice, Error, GPUModule->getContext());

    if (!LibDeviceModule) {
      BuildSuccessful = false;
      report_fatal_error("Could not find or load libdevice. Skipping GPU "
                         "kernel generation. Please set -polly-acc-libdevice "
                         "accordingly.\n");
      return;
    }

    Linker L(*GPUModule);

    // Set an nvptx64 target triple to avoid linker warnings. The original
    // triple of the libdevice files is nvptx-unknown-unknown.
    LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
  }
}

std::string GPUNodeBuilder::finalizeKernelFunction() {
  if (verifyModule(*GPUModule)) {
    LLVM_DEBUG(dbgs() << "verifyModule failed on module:\n";
               GPUModule->print(dbgs(), nullptr); dbgs() << "\n";);
    LLVM_DEBUG(dbgs() << "verifyModule Error:\n";
               verifyModule(*GPUModule, &dbgs()););

    if (FailOnVerifyModuleFailure)
      llvm_unreachable("VerifyModule failed.");

    BuildSuccessful = false;
    return "";
  }

  addCUDALibDevice();

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
    // Optimize module.
    llvm::legacy::PassManager OptPasses;
    PassManagerBuilder PassBuilder;
    PassBuilder.OptLevel = 3;
    PassBuilder.SizeLevel = 0;
    PassBuilder.populateModulePassManager(OptPasses);
    OptPasses.run(*GPUModule);
  }

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.release();
  KernelIDs.clear();

  return Assembly;
}

/// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff`
/// @param PwAffs The list of piecewise affine functions to create an
///               `isl_pw_aff_list` from. We expect an rvalue ref because
///               all the isl_pw_aff are used up by this function.
///
/// @returns The `isl_pw_aff_list`.
__isl_give isl_pw_aff_list *
createPwAffList(isl_ctx *Context,
                const std::vector<__isl_take isl_pw_aff *> &&PwAffs) {
  isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size());

  for (unsigned i = 0; i < PwAffs.size(); i++) {
    List = isl_pw_aff_list_insert(List, i, PwAffs[i]);
  }

  return List;
}

/// Align all the `PwAffs` such that they have the same parameter dimensions.
///
/// We loop over all `pw_aff` and align all of their spaces together to
/// create a common space for all the `pw_aff`. This common space is the
/// `AlignSpace`. We then align all the `pw_aff` to this space. We start
/// with the given `SeedSpace`.
/// @param PwAffs The list of piecewise affine functions we want to align.
///               This is an rvalue reference because the entire vector is
///               used up by the end of the operation.
/// @param SeedSpace The space to start the alignment process with.
/// @returns A std::pair, whose first element is the aligned space,
///          whose second element is the vector of aligned piecewise
///          affines.
static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>>
alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs,
            __isl_take isl_space *SeedSpace) {
  assert(SeedSpace && "Invalid seed space given.");

  isl_space *AlignSpace = SeedSpace;
  for (isl_pw_aff *PwAff : PwAffs) {
    isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff);
    AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace);
  }
  std::vector<isl_pw_aff *> AdjustedPwAffs;

  for (unsigned i = 0; i < PwAffs.size(); i++) {
    isl_pw_aff *Adjusted = PwAffs[i];
    assert(Adjusted && "Invalid pw_aff given.");
    Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace));
    AdjustedPwAffs.push_back(Adjusted);
  }
  return std::make_pair(AlignSpace, AdjustedPwAffs);
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  GPURuntime Runtime = GPURuntime::CUDA;
  GPUArch Architecture = GPUArch::NVPTX64;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {
    // Apply defaults.
    Runtime = GPURuntimeChoice;
    Architecture = GPUArchChoice;
  }

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;
    DebugOptions->verbose = false;

    Options->debug = DebugOptions;

    Options->group_chains = false;
    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile = true;
    Options->tile_size = 32;

    Options->isolate_full_tiles = false;

    Options->use_private_memory = PrivateMemory;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->allow_gnu_extensions = false;

    Options->unroll_copy_shared = false;
    Options->unroll_gpu_tile = false;
    Options->live_range_reordering = true;

    Options->hybrid = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }
  2218. /// Get a tagged access relation containing all accesses of type @p AccessTy.
  2219. ///
  2220. /// Instead of a normal access of the form:
  2221. ///
  2222. /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  2223. ///
  2224. /// a tagged access has the form
  2225. ///
  2226. /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  2227. ///
  2228. /// where 'id' is an additional space that references the memory access that
  2229. /// triggered the access.
  2230. ///
  2231. /// @param AccessTy The type of the memory accesses to collect.
  2232. ///
  2233. /// @return The relation describing all tagged memory accesses.
  2234. isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
  2235. isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release());
  2236. for (auto &Stmt : *S)
  2237. for (auto &Acc : Stmt)
  2238. if (Acc->getType() == AccessTy) {
  2239. isl_map *Relation = Acc->getAccessRelation().release();
  2240. Relation =
  2241. isl_map_intersect_domain(Relation, Stmt.getDomain().release());
  2242. isl_space *Space = isl_map_get_space(Relation);
  2243. Space = isl_space_range(Space);
  2244. Space = isl_space_from_range(Space);
  2245. Space =
  2246. isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
  2247. isl_map *Universe = isl_map_universe(Space);
  2248. Relation = isl_map_domain_product(Relation, Universe);
  2249. Accesses = isl_union_map_add_map(Accesses, Relation);
  2250. }
  2251. return Accesses;
  2252. }
  2253. /// Get the set of all read accesses, tagged with the access id.
  2254. ///
  2255. /// @see getTaggedAccesses
  2256. isl_union_map *getTaggedReads() {
  2257. return getTaggedAccesses(MemoryAccess::READ);
  2258. }
  2259. /// Get the set of all may (and must) accesses, tagged with the access id.
  2260. ///
  2261. /// @see getTaggedAccesses
  2262. isl_union_map *getTaggedMayWrites() {
  2263. return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
  2264. getTaggedAccesses(MemoryAccess::MUST_WRITE));
  2265. }
  2266. /// Get the set of all must accesses, tagged with the access id.
  2267. ///
  2268. /// @see getTaggedAccesses
  2269. isl_union_map *getTaggedMustWrites() {
  2270. return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  2271. }
  2272. /// Collect parameter and array names as isl_ids.
  2273. ///
  2274. /// To reason about the different parameters and arrays used, ppcg requires
  2275. /// a list of all isl_ids in use. As PPCG traditionally performs
  2276. /// source-to-source compilation each of these isl_ids is mapped to the
  2277. /// expression that represents it. As we do not have a corresponding
  2278. /// expression in Polly, we just map each id to a 'zero' expression to match
  2279. /// the data format that ppcg expects.
  2280. ///
  2281. /// @returns Retun a map from collected ids to 'zero' ast expressions.
  2282. __isl_give isl_id_to_ast_expr *getNames() {
  2283. auto *Names = isl_id_to_ast_expr_alloc(
  2284. S->getIslCtx().get(),
  2285. S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
  2286. auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get()));
  2287. for (const SCEV *P : S->parameters()) {
  2288. isl_id *Id = S->getIdForParam(P).release();
  2289. Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  2290. }
  2291. for (auto &Array : S->arrays()) {
  2292. auto Id = Array->getBasePtrId().release();
  2293. Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  2294. }
  2295. isl_ast_expr_free(Zero);
  2296. return Names;
  2297. }
  2298. /// Create a new PPCG scop from the current scop.
  2299. ///
  2300. /// The PPCG scop is initialized with data from the current polly::Scop. From
  2301. /// this initial data, the data-dependences in the PPCG scop are initialized.
  2302. /// We do not use Polly's dependence analysis for now, to ensure we match
  2303. /// the PPCG default behaviour more closely.
  2304. ///
  2305. /// @returns A new ppcg scop.
  2306. ppcg_scop *createPPCGScop() {
  2307. MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
  2308. auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
  2309. PPCGScop->options = createPPCGOptions();
  2310. // enable live range reordering
  2311. PPCGScop->options->live_range_reordering = 1;
  2312. PPCGScop->start = 0;
  2313. PPCGScop->end = 0;
  2314. PPCGScop->context = S->getContext().release();
  2315. PPCGScop->domain = S->getDomains().release();
  2316. // TODO: investigate this further. PPCG calls collect_call_domains.
  2317. PPCGScop->call = isl_union_set_from_set(S->getContext().release());
  2318. PPCGScop->tagged_reads = getTaggedReads();
  2319. PPCGScop->reads = S->getReads().release();
  2320. PPCGScop->live_in = nullptr;
  2321. PPCGScop->tagged_may_writes = getTaggedMayWrites();
  2322. PPCGScop->may_writes = S->getWrites().release();
  2323. PPCGScop->tagged_must_writes = getTaggedMustWrites();
  2324. PPCGScop->must_writes = S->getMustWrites().release();
  2325. PPCGScop->live_out = nullptr;
  2326. PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.release();
  2327. PPCGScop->must_kills = KillsInfo.MustKills.release();
  2328. PPCGScop->tagger = nullptr;
  2329. PPCGScop->independence =
  2330. isl_union_map_empty(isl_set_get_space(PPCGScop->context));
  2331. PPCGScop->dep_flow = nullptr;
  2332. PPCGScop->tagged_dep_flow = nullptr;
  2333. PPCGScop->dep_false = nullptr;
  2334. PPCGScop->dep_forced = nullptr;
  2335. PPCGScop->dep_order = nullptr;
  2336. PPCGScop->tagged_dep_order = nullptr;
  2337. PPCGScop->schedule = S->getScheduleTree().release();
  2338. // If we have something non-trivial to kill, add it to the schedule
  2339. if (KillsInfo.KillsSchedule.get())
  2340. PPCGScop->schedule = isl_schedule_sequence(
  2341. PPCGScop->schedule, KillsInfo.KillsSchedule.release());
  2342. PPCGScop->names = getNames();
  2343. PPCGScop->pet = nullptr;
  2344. compute_tagger(PPCGScop);
  2345. compute_dependences(PPCGScop);
  2346. eliminate_dead_code(PPCGScop);
  2347. return PPCGScop;
  2348. }
  2349. /// Collect the array accesses in a statement.
  2350. ///
  2351. /// @param Stmt The statement for which to collect the accesses.
  2352. ///
  2353. /// @returns A list of array accesses.
  2354. gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
  2355. gpu_stmt_access *Accesses = nullptr;
  2356. for (MemoryAccess *Acc : Stmt) {
  2357. auto Access =
  2358. isl_alloc_type(S->getIslCtx().get(), struct gpu_stmt_access);
  2359. Access->read = Acc->isRead();
  2360. Access->write = Acc->isWrite();
  2361. Access->access = Acc->getAccessRelation().release();
  2362. isl_space *Space = isl_map_get_space(Access->access);
  2363. Space = isl_space_range(Space);
  2364. Space = isl_space_from_range(Space);
  2365. Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
  2366. isl_map *Universe = isl_map_universe(Space);
  2367. Access->tagged_access =
  2368. isl_map_domain_product(Acc->getAccessRelation().release(), Universe);
  2369. Access->exact_write = !Acc->isMayWrite();
  2370. Access->ref_id = Acc->getId().release();
  2371. Access->next = Accesses;
  2372. Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
  2373. // TODO: Also mark one-element accesses to arrays as fixed-element.
  2374. Access->fixed_element =
  2375. Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false;
  2376. Accesses = Access;
  2377. }
  2378. return Accesses;
  2379. }
  2380. /// Collect the list of GPU statements.
  2381. ///
  2382. /// Each statement has an id, a pointer to the underlying data structure,
  2383. /// as well as a list with all memory accesses.
  2384. ///
  2385. /// TODO: Initialize the list of memory accesses.
  2386. ///
  2387. /// @returns A linked-list of statements.
  2388. gpu_stmt *getStatements() {
  2389. gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx().get(), struct gpu_stmt,
  2390. std::distance(S->begin(), S->end()));
  2391. int i = 0;
  2392. for (auto &Stmt : *S) {
  2393. gpu_stmt *GPUStmt = &Stmts[i];
  2394. GPUStmt->id = Stmt.getDomainId().release();
  2395. // We use the pet stmt pointer to keep track of the Polly statements.
  2396. GPUStmt->stmt = (pet_stmt *)&Stmt;
  2397. GPUStmt->accesses = getStmtAccesses(Stmt);
  2398. i++;
  2399. }
  2400. return Stmts;
  2401. }
  2402. /// Derive the extent of an array.
  2403. ///
  2404. /// The extent of an array is the set of elements that are within the
  2405. /// accessed array. For the inner dimensions, the extent constraints are
  2406. /// 0 and the size of the corresponding array dimension. For the first
  2407. /// (outermost) dimension, the extent constraints are the minimal and maximal
  2408. /// subscript value for the first dimension.
  2409. ///
  2410. /// @param Array The array to derive the extent for.
  2411. ///
  2412. /// @returns An isl_set describing the extent of the array.
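  ///
  /// As a purely illustrative example (array and bounds made up): for a
  /// two-dimensional array A of size n x 1024 that is only accessed with
  /// subscripts A[i][j] where 5 <= i < n, the derived extent would be
  ///   [n] -> { A[i0, i1] : 5 <= i0 < n and 0 <= i1 <= 1023 }
  /// i.e. the outer dimension is bounded by the minimal and maximal subscript
  /// value and each inner dimension by 0 and its size.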
  isl::set getExtent(ScopArrayInfo *Array) {
    unsigned NumDims = Array->getNumberOfDimensions();
    if (Array->getNumberOfDimensions() == 0)
      return isl::set::universe(Array->getSpace());
    isl::union_map Accesses = S->getAccesses(Array);
    isl::union_set AccessUSet = Accesses.range();
    AccessUSet = AccessUSet.coalesce();
    AccessUSet = AccessUSet.detect_equalities();
    AccessUSet = AccessUSet.coalesce();
    if (AccessUSet.is_empty())
      return isl::set::empty(Array->getSpace());
    isl::set AccessSet = AccessUSet.extract_set(Array->getSpace());
    isl::local_space LS = isl::local_space(Array->getSpace());
    isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, 0);
    isl::pw_aff OuterMin = AccessSet.dim_min(0);
    isl::pw_aff OuterMax = AccessSet.dim_max(0);
    OuterMin = OuterMin.add_dims(isl::dim::in,
                                 unsignedFromIslSize(Val.dim(isl::dim::in)));
    OuterMax = OuterMax.add_dims(isl::dim::in,
                                 unsignedFromIslSize(Val.dim(isl::dim::in)));
    OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId());
    OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId());
    isl::set Extent = isl::set::universe(Array->getSpace());
    Extent = Extent.intersect(OuterMin.le_set(Val));
    Extent = Extent.intersect(OuterMax.ge_set(Val));
    for (unsigned i = 1; i < NumDims; ++i)
      Extent = Extent.lower_bound_si(isl::dim::set, i, 0);
    for (unsigned i = 0; i < NumDims; ++i) {
      isl::pw_aff PwAff = Array->getDimensionSizePw(i);
      // isl_pw_aff can be NULL for zero dimension. Only in the case of a
      // Fortran array will we have a legitimate dimension.
      if (PwAff.is_null()) {
        assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
        continue;
      }
      isl::pw_aff Val = isl::aff::var_on_domain(
          isl::local_space(Array->getSpace()), isl::dim::set, i);
      PwAff = PwAff.add_dims(isl::dim::in,
                             unsignedFromIslSize(Val.dim(isl::dim::in)));
      PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in));
      isl::set Set = PwAff.gt_set(Val);
      Extent = Set.intersect(Extent);
    }
    return Extent;
  }
  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly
  /// from ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array     The polly array from which to take the information.
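  ///
  /// For illustration (values made up): if the extent of a two-dimensional
  /// array is
  ///   [m] -> { A[i0, i1] : 0 <= i0 <= 9 and 0 <= i1 < m }
  /// the derived bounds are [10, m]: the outer bound is the maximal subscript
  /// of the first dimension plus one, the inner bound is taken from
  /// ScopArrayInfo.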
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    std::vector<isl_pw_aff *> Bounds;
    if (PPCGArray.n_index > 0) {
      if (isl_set_is_empty(PPCGArray.extent)) {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        isl_local_space *LS = isl_local_space_from_space(
            isl_space_params(isl_set_get_space(Dom)));
        isl_set_free(Dom);
        isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
        Bounds.push_back(Zero);
      } else {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
        isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
        isl_set_free(Dom);
        Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
        isl_local_space *LS =
            isl_local_space_from_space(isl_set_get_space(Dom));
        isl_aff *One = isl_aff_zero_on_domain(LS);
        One = isl_aff_add_constant_si(One, 1);
        Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
        Bound = isl_pw_aff_gist(Bound, S->getContext().release());
        Bounds.push_back(Bound);
      }
    }
    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i).release();
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      // We need types to work out, which is why we perform this weird dance
      // with `Aff` and `Bound`. Consider this example:
      //   LS:    [p] -> { [] }
      //   Zero:  [p] -> { [] }  | Implicitly, is [p] -> { ~ -> [] }.
      // This `~` is used to denote a "null space" (which is different from
      // a *zero dimensional* space), which is something that ISL does not
      // show you when pretty printing.
      //   Bound: [p] -> { [] -> [(10p)] }  | Here, the [] is a *zero
      //   dimensional* space, not a "null space" which does not exist at all.
      // When we pullback (precompose) `Bound` with `Zero`, we get:
      //   Bound . Zero =
      //     ([p] -> { [] -> [(10p)] }) . ([p] -> { ~ -> [] }) =
      //     [p] -> { ~ -> [(10p)] } =
      //     [p] -> [(10p)] (as ISL pretty prints it)
      //   Bound Pullback: [p] -> { [(10p)] }
      // We want this kind of an expression for Bound, without a
      // zero dimensional input, but with a "null space" input for the types
      // to work out later on, as far as I (Siddharth Bhat) understand.
      // I was unable to find a reference to this in the ISL manual.
      // References: Tobias Grosser.
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      Bounds.push_back(Bound);
    }
    /// To construct an `isl_multi_pw_aff`, we need all the individual
    /// `pw_aff` to have the same parameter dimensions. So, we need to align
    /// them to an appropriate space.
    /// Scop::Context is _not_ an appropriate space, because when we have
    /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not
    /// contain all parameter dimensions.
    /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together.
    isl_space *SeedAlignSpace = S->getParamSpace().release();
    SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1);
    isl_space *AlignSpace = nullptr;
    std::vector<isl_pw_aff *> AlignedBounds;
    std::tie(AlignSpace, AlignedBounds) =
        alignPwAffs(std::move(Bounds), SeedAlignSpace);
    assert(AlignSpace && "alignPwAffs did not initialise AlignSpace");
    isl_pw_aff_list *BoundsList =
        createPwAffList(S->getIslCtx().get(), std::move(AlignedBounds));
    isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);
    BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace);
    assert(BoundsSpace && "Unable to access space of array.");
    assert(BoundsList && "Unable to access list of bounds.");
    PPCGArray.bound =
        isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
    assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
  }
  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg  The program to compute the arrays for.
  /// @param ValidSAIs The ScopArrayInfo objects to create array entries for.
  void createArrays(gpu_prog *PPCGProg,
                    const SmallVector<ScopArrayInfo *, 4> &ValidSAIs) {
    int i = 0;
    for (auto &Array : ValidSAIs) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);
      OS << *Array->getElementType();
      TypeName = OS.str();
      gpu_array_info &PPCGArray = PPCGProg->array[i];
      PPCGArray.space = Array->getSpace().release();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = DL->getTypeAllocSize(Array->getElementType());
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.extent = getExtent(Array).release();
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar =
          Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;
      PPCGArray.bound = nullptr;
      setArrayBounds(PPCGArray, Array);
      i++;
      collect_references(PPCGProg, &PPCGArray);
      PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray);
    }
  }
  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release());
    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace().release();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }
    return Maps;
  }
  /// Create a default-initialized PPCG GPU program.
  ///
  /// @param PPCGScop The ppcg scop from which to derive the program.
  ///
  /// @returns A new gpu program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {
    if (!PPCGScop)
      return nullptr;
    auto PPCGProg = isl_calloc_type(S->getIslCtx().get(), struct gpu_prog);
    PPCGProg->ctx = S->getIslCtx().get();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    // TODO: verify that this assignment is correct.
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    // Only consider arrays that have a non-empty extent.
    // Otherwise, this will cause us to consider the following kinds of
    // empty arrays:
    //   1. Invariant loads that are represented by SAI objects.
    //   2. Arrays with statically known zero size.
    auto ValidSAIsRange =
        make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool {
          return !getExtent(SAI).is_empty();
        });
    SmallVector<ScopArrayInfo *, 4> ValidSAIs(ValidSAIsRange.begin(),
                                              ValidSAIsRange.end());
    PPCGProg->n_array =
        ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(
        S->getIslCtx().get(), struct gpu_array_info, PPCGProg->n_array);
    createArrays(PPCGProg, ValidSAIs);
    PPCGProg->array_order = nullptr;
    collect_order_dependences(PPCGProg);
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    return PPCGProg;
  }
  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };
  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P       The printer to print to.
  /// @param Options The printing options to use.
  /// @param Node    The node to print.
  /// @param User    A user pointer to carry additional data. This pointer is
  ///                expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);
    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");
      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }
      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }
    return print_host_user(P, Options, Node, User);
  }
  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print.
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx().get());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    outs() << String << "\n";
    free(String);
    isl_printer_free(P);
  }
  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree     An AST describing GPU code.
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx().get());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    outs() << "# host\n";
    outs() << String << "\n";
    free(String);
    isl_printer_free(P);
    for (auto Kernel : Data.Kernels) {
      outs() << "# kernel" << Kernel->id << "\n";
      printKernel(Kernel);
    }
  }
  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute a new schedule for the program.
  //  2) Map the schedule to the GPU (TODO)
  //  3) Generate code for the new schedule (TODO)
  //
  // We do not use the Polly ScheduleOptimizer here, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {
    auto PPCGGen = isl_calloc_type(S->getIslCtx().get(), struct gpu_gen);
    PPCGGen->ctx = S->getIslCtx().get();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;
    // Set the scheduling strategy to the same strategy PPCG is using.
    isl_options_set_schedule_serialize_sccs(PPCGGen->ctx, false);
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);
    isl_schedule *Schedule = get_schedule(PPCGGen);
    int has_permutable = has_any_permutable_node(Schedule);
    Schedule =
        isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
      LLVM_DEBUG(dbgs() << getUniqueScopName(S)
                        << " does not have permutable bands. Bailing out\n";);
    } else {
      const bool CreateTransferToFromDevice = !PollyManagedMemory;
      Schedule = map_to_device(PPCGGen, Schedule, CreateTransferToFromDevice);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }
    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx().get());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");
      outs() << isl_printer_get_str(P) << "\n";
      isl_printer_free(P);
    }
    if (DumpCode) {
      outs() << "Code\n";
      outs() << "====\n";
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        outs() << "No code generated\n";
    }
    isl_schedule_free(Schedule);
    return PPCGGen;
  }
  /// Free the gpu_gen structure.
  ///
  /// @param PPCGGen The ppcg_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }
  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg is not freeing these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }
  /// Approximate the number of points in the set.
  ///
  /// This function returns an ast expression that overapproximates the number
  /// of points in an isl set through the rectangular hull surrounding this
  /// set.
  ///
  /// @param Set   The set to count.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of points in the set.
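  ///
  /// For illustration (set chosen arbitrarily): the triangular set
  ///   [n] -> { [i, j] : 0 <= j <= i < n }
  /// contains roughly n^2 / 2 points, but its rectangular hull is the full
  /// n x n box, so the returned expression evaluates to n * n.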
  __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
                                             __isl_keep isl_ast_build *Build) {
    isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
    auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
    isl_space *Space = isl_set_get_space(Set);
    Space = isl_space_params(Space);
    auto *Univ = isl_set_universe(Space);
    isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
    for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) {
      isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
      isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
      isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
      DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
      auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
      Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
    }
    isl_set_free(Set);
    isl_pw_aff_free(OneAff);
    return Expr;
  }
  /// Approximate the number of dynamic instructions executed by a given
  /// statement.
  ///
  /// @param Stmt  The statement for which to compute the number of dynamic
  ///              instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          by @p Stmt.
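  ///
  /// For example (numbers are illustrative): a block statement containing 8
  /// LLVM-IR instructions whose domain is approximated to 1000 points yields
  /// the expression 8 * 1000.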
  __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
                                             __isl_keep isl_ast_build *Build) {
    auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build);
    long InstCount = 0;
    if (Stmt.isBlockStmt()) {
      auto *BB = Stmt.getBasicBlock();
      InstCount = std::distance(BB->begin(), BB->end());
    } else {
      auto *R = Stmt.getRegion();
      for (auto *BB : R->blocks()) {
        InstCount += std::distance(BB->begin(), BB->end());
      }
    }
    isl_val *InstVal = isl_val_int_from_si(S->getIslCtx().get(), InstCount);
    auto *InstExpr = isl_ast_expr_from_val(InstVal);
    return isl_ast_expr_mul(InstExpr, Iterations);
  }
  /// Approximate the number of dynamic instructions executed in the scop.
  ///
  /// @param S     The scop for which to approximate dynamic instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          in @p S.
  __isl_give isl_ast_expr *
  getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
    isl_ast_expr *Instructions;
    isl_val *Zero = isl_val_int_from_si(S.getIslCtx().get(), 0);
    Instructions = isl_ast_expr_from_val(Zero);
    for (ScopStmt &Stmt : S) {
      isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
      Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
    }
    return Instructions;
  }
  /// Create a check that ensures sufficient compute in the scop.
  ///
  /// @param S     The scop for which to ensure sufficient compute.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An expression that evaluates to TRUE in case of sufficient
  ///          compute and to FALSE, otherwise.
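  ///
  /// Conceptually, if the minimal compute threshold is N, the generated
  /// run-time check is `approximated_dynamic_instructions >= N`, where the
  /// left-hand side is the expression built by getNumberOfIterations.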
  __isl_give isl_ast_expr *
  createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
    auto Iterations = getNumberOfIterations(S, Build);
    auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx().get(), MinCompute);
    auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
    return isl_ast_expr_ge(Iterations, MinComputeExpr);
  }
  /// Check if the basic block contains a function we cannot codegen for GPU
  /// kernels.
  ///
  /// If this basic block does something with a `Function` other than calling
  /// a function that we support in a kernel, return true.
  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB,
                                            bool AllowCUDALibDevice) {
    for (const Instruction &Inst : *BB) {
      const CallInst *Call = dyn_cast<CallInst>(&Inst);
      if (Call && isValidFunctionInKernel(Call->getCalledFunction(),
                                          AllowCUDALibDevice))
        continue;
      for (Value *Op : Inst.operands())
        // Look for (<func-type>*) among operands of Inst.
        if (auto PtrTy = dyn_cast<PointerType>(Op->getType())) {
          if (isa<FunctionType>(PtrTy->getElementType())) {
            LLVM_DEBUG(dbgs()
                       << Inst << " has illegal use of function in kernel.\n");
            return true;
          }
        }
    }
    return false;
  }
  /// Return whether the Scop S uses functions in a way that we do not support.
  bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) {
    for (auto &Stmt : S) {
      if (Stmt.isBlockStmt()) {
        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(),
                                                 AllowCUDALibDevice))
          return true;
      } else {
        assert(Stmt.isRegionStmt() &&
               "Stmt was neither block nor region statement");
        for (const BasicBlock *BB : Stmt.getRegion()->blocks())
          if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice))
            return true;
      }
    }
    return false;
  }
  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);
    Region *R = &S->getRegion();
    simplifyRegion(R, DT, LI, RI);
    BasicBlock *EnteringBB = R->getEnteringBlock();
    PollyIRBuilder Builder(EnteringBB->getContext(), ConstantFolder(),
                           IRInserter(Annotator));
    Builder.SetInsertPoint(EnteringBB->getTerminator());
    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BBPair StartExitBlocks;
    BranchInst *CondBr = nullptr;
    std::tie(StartExitBlocks, CondBr) =
        executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
    BasicBlock *StartBlock = std::get<0>(StartExitBlocks);
    assert(CondBr && "CondBr not initialized by executeScopConditionally");
    GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
                               StartBlock, Prog, Runtime, Architecture);
    // TODO: Handle LICM.
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx().get());
    isl::ast_expr Condition =
        IslAst::buildRunCondition(*S, isl::manage_copy(Build));
    isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
    Condition =
        isl::manage(isl_ast_expr_and(Condition.release(), SufficientCompute));
    isl_ast_build_free(Build);
    // Preload invariant loads. Note: This should happen before the RTC
    // because the RTC may depend on values that are invariant load hoisted.
    if (!NodeBuilder.preloadInvariantLoads()) {
      // Patch the introduced branch condition to ensure that we always execute
      // the original SCoP.
      auto *FalseI1 = Builder.getFalse();
      auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator();
      SplitBBTerm->setOperand(0, FalseI1);
      LLVM_DEBUG(dbgs() << "preloading invariant loads failed in function: " +
                               S->getFunction().getName() +
                               " | Scop Region: " + S->getNameStr());
      // Adjust the dominator tree accordingly.
      auto *ExitingBlock = StartBlock->getUniqueSuccessor();
      assert(ExitingBlock);
      auto *MergeBlock = ExitingBlock->getUniqueSuccessor();
      assert(MergeBlock);
      polly::markBlockUnreachable(*StartBlock, Builder);
      polly::markBlockUnreachable(*ExitingBlock, Builder);
      auto *ExitingBB = S->getExitingBlock();
      assert(ExitingBB);
      DT->changeImmediateDominator(MergeBlock, ExitingBB);
      DT->eraseNode(ExitingBlock);
      isl_ast_node_free(Root);
    } else {
      if (polly::PerfMonitoring) {
        PerfMonitor P(*S, EnteringBB->getParent()->getParent());
        P.initialize();
        P.insertRegionStart(SplitBlock->getTerminator());
        // TODO: actually think if this is the correct exiting block to place
        // the `end` performance marker. Invariant load hoisting changes
        // the CFG in a way that I do not precisely understand, so I
        // (Siddharth<siddu.druid@gmail.com>) should come back to this and
        // think about which exiting block to use.
        auto *ExitingBlock = StartBlock->getUniqueSuccessor();
        assert(ExitingBlock);
        BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor();
        P.insertRegionEnd(MergeBlock->getTerminator());
      }
      NodeBuilder.addParameters(S->getContext().release());
      Value *RTC = NodeBuilder.createRTC(Condition.release());
      Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
      Builder.SetInsertPoint(&*StartBlock->begin());
      NodeBuilder.create(Root);
    }
    /// In case a sequential kernel has more surrounding loops than any
    /// parallel kernel, the SCoP is probably mostly sequential. Hence, there
    /// is no point in running it on a GPU.
    if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
      CondBr->setOperand(0, Builder.getFalse());
    if (!NodeBuilder.BuildSuccessful)
      CondBr->setOperand(0, Builder.getFalse());
  }
  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
    LLVM_DEBUG(dbgs() << "PPCGCodeGen running on : " << getUniqueScopName(S)
                      << " | loop depth: " << S->getMaxLoopDepth() << "\n");
    // We currently do not support functions other than intrinsics inside
    // kernels, as code generation will need to offload function calls to the
    // kernel. This may lead to a kernel trying to call a function on the host.
    // This also allows us to prevent codegen from trying to take the
    // address of an intrinsic function to send to the kernel.
    if (containsInvalidKernelFunction(CurrentScop,
                                      Architecture == GPUArch::NVPTX64)) {
      LLVM_DEBUG(
          dbgs() << getUniqueScopName(S)
                 << " contains function which cannot be materialised in a GPU "
                    "kernel. Bailing out.\n";);
      return false;
    }
    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
    if (PPCGGen->tree) {
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
      CurrentScop.markAsToBeSkipped();
    } else {
      LLVM_DEBUG(dbgs() << getUniqueScopName(S)
                        << " has empty PPCGGen->tree. Bailing out.\n");
    }
    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);
    return true;
  }
  void printScop(raw_ostream &, Scop &) const override {}
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    ScopPass::getAnalysisUsage(AU);
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetectionWrapperPass>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
  generator->Runtime = Runtime;
  generator->Architecture = Arch;
  return generator;
}

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)