  1. //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Take a scop created by ScopInfo and map it to GPU code using the ppcg
  10. // GPU mapping strategy.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "polly/CodeGen/PPCGCodeGeneration.h"
  14. #include "polly/CodeGen/CodeGeneration.h"
  15. #include "polly/CodeGen/IslAst.h"
  16. #include "polly/CodeGen/IslNodeBuilder.h"
  17. #include "polly/CodeGen/PerfMonitor.h"
  18. #include "polly/CodeGen/Utils.h"
  19. #include "polly/DependenceInfo.h"
  20. #include "polly/LinkAllPasses.h"
  21. #include "polly/Options.h"
  22. #include "polly/ScopDetection.h"
  23. #include "polly/ScopInfo.h"
  24. #include "polly/Support/ISLTools.h"
  25. #include "polly/Support/SCEVValidator.h"
  26. #include "llvm/ADT/PostOrderIterator.h"
  27. #include "llvm/Analysis/TargetTransformInfo.h"
  28. #include "llvm/IR/IntrinsicsNVPTX.h"
  29. #include "llvm/IR/LegacyPassManager.h"
  30. #include "llvm/IR/Verifier.h"
  31. #include "llvm/IRReader/IRReader.h"
  32. #include "llvm/InitializePasses.h"
  33. #include "llvm/Linker/Linker.h"
  34. #include "llvm/MC/TargetRegistry.h"
  35. #include "llvm/Support/SourceMgr.h"
  36. #include "llvm/Target/TargetMachine.h"
  37. #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  38. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  39. #include "isl/union_map.h"
  40. #include <algorithm>
  41. extern "C" {
  42. #include "ppcg/cuda.h"
  43. #include "ppcg/gpu.h"
  44. #include "ppcg/ppcg.h"
  45. }
  46. #include "llvm/Support/Debug.h"
  47. using namespace polly;
  48. using namespace llvm;
  49. #define DEBUG_TYPE "polly-codegen-ppcg"
  50. static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
  51. cl::desc("Dump the computed GPU Schedule"),
  52. cl::Hidden, cl::cat(PollyCategory));
  53. static cl::opt<bool>
  54. DumpCode("polly-acc-dump-code",
  55. cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
  56. cl::cat(PollyCategory));
  57. static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
  58. cl::desc("Dump the kernel LLVM-IR"),
  59. cl::Hidden, cl::cat(PollyCategory));
  60. static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
  61. cl::desc("Dump the kernel assembly code"),
  62. cl::Hidden, cl::cat(PollyCategory));
  63. static cl::opt<bool> FastMath("polly-acc-fastmath",
  64. cl::desc("Allow unsafe math optimizations"),
  65. cl::Hidden, cl::cat(PollyCategory));
  66. static cl::opt<bool> SharedMemory("polly-acc-use-shared",
  67. cl::desc("Use shared memory"), cl::Hidden,
  68. cl::cat(PollyCategory));
  69. static cl::opt<bool> PrivateMemory("polly-acc-use-private",
  70. cl::desc("Use private memory"), cl::Hidden,
  71. cl::cat(PollyCategory));
  72. bool polly::PollyManagedMemory;
  73. static cl::opt<bool, true>
  74. XManagedMemory("polly-acc-codegen-managed-memory",
  75. cl::desc("Generate Host kernel code assuming"
  76. " that all memory has been"
  77. " declared as managed memory"),
  78. cl::location(PollyManagedMemory), cl::Hidden,
  79. cl::init(false), cl::cat(PollyCategory));
  80. static cl::opt<bool>
  81. FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure",
  82. cl::desc("Fail and generate a backtrace if"
  83. " verifyModule fails on the GPU "
  84. " kernel module."),
  85. cl::Hidden, cl::cat(PollyCategory));
  86. static cl::opt<std::string> CUDALibDevice(
  87. "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden,
  88. cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"),
  89. cl::cat(PollyCategory));
  90. static cl::opt<std::string>
  91. CudaVersion("polly-acc-cuda-version",
  92. cl::desc("The CUDA version to compile for"), cl::Hidden,
  93. cl::init("sm_30"), cl::cat(PollyCategory));
  94. static cl::opt<int>
  95. MinCompute("polly-acc-mincompute",
  96. cl::desc("Minimal number of compute statements to run on GPU."),
  97. cl::Hidden, cl::init(10 * 512 * 512));
  98. GPURuntime polly::GPURuntimeChoice;
  99. static cl::opt<GPURuntime, true>
  100. XGPURuntimeChoice("polly-gpu-runtime",
  101. cl::desc("The GPU Runtime API to target"),
  102. cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
  103. "use the CUDA Runtime API"),
  104. clEnumValN(GPURuntime::OpenCL, "libopencl",
  105. "use the OpenCL Runtime API")),
  106. cl::location(polly::GPURuntimeChoice),
  107. cl::init(GPURuntime::CUDA), cl::cat(PollyCategory));
  108. GPUArch polly::GPUArchChoice;
  109. static cl::opt<GPUArch, true>
  110. XGPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
  111. cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
  112. "target NVIDIA 64-bit architecture"),
  113. clEnumValN(GPUArch::SPIR32, "spir32",
  114. "target SPIR 32-bit architecture"),
  115. clEnumValN(GPUArch::SPIR64, "spir64",
  116. "target SPIR 64-bit architecture")),
  117. cl::location(polly::GPUArchChoice),
  118. cl::init(GPUArch::NVPTX64), cl::cat(PollyCategory));
  119. extern bool polly::PerfMonitoring;
  120. /// Return a unique name for a Scop, consisting of the scop region name and
  121. /// the function name.
  122. std::string getUniqueScopName(const Scop *S) {
  123. return "Scop Region: " + S->getNameStr() +
  124. " | Function: " + std::string(S->getFunction().getName());
  125. }
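// Illustrative example (not part of the original source): for a scop region
// named "%for.body---%for.end" in a function "gemm", this would return
// "Scop Region: %for.body---%for.end | Function: gemm".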
  126. /// Used to store information PPCG wants for kills. This information is
  127. /// used by live range reordering.
  128. ///
  129. /// @see computeLiveRangeReordering
  130. /// @see GPUNodeBuilder::createPPCGScop
  131. /// @see GPUNodeBuilder::createPPCGProg
  132. struct MustKillsInfo {
  133. /// Collection of all kill statements that will be sequenced at the end of
  134. /// PPCGScop->schedule.
  135. ///
  136. /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
  137. /// which merges schedules in *arbitrary* order.
  138. /// (we don't care about the order of the kills anyway).
  139. isl::schedule KillsSchedule;
  140. /// Map from kill statement instances to scalars that need to be
  141. /// killed.
  142. ///
  143. /// We currently derive kill information for:
  144. /// 1. phi nodes. PHI nodes are not alive outside the scop and can
  145. /// consequently all be killed.
  146. /// 2. Scalar arrays that are not used outside the Scop. This is
  147. /// checked by `isScalarUsesContainedInScop`.
  148. /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
  149. isl::union_map TaggedMustKills;
  150. /// Tagged must kills stripped of the tags.
  151. /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
  152. isl::union_map MustKills;
  153. MustKillsInfo() : KillsSchedule() {}
  154. };
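// Illustrative sketch (invented names) of what computeMustKillsInfo below
// produces for a scalar MemRef_tmp that is only used inside the scop:
//   TaggedMustKills: [n] -> { [SKill_phantom_MemRef_tmp[] ->
//                              ref_phantomMemRef_tmp[]] -> MemRef_tmp[] }
//   MustKills:       [n] -> { SKill_phantom_MemRef_tmp[] -> MemRef_tmp[] }
// using the phantom statement and reference ids constructed in that function.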
  155. /// Check if SAI's uses are entirely contained within Scop S.
  157. /// If a scalar is used only within a Scop, we are free to kill it, as no data
  157. /// can flow in/out of the value any more.
  158. /// @see computeMustKillsInfo
  159. static bool isScalarUsesContainedInScop(const Scop &S,
  160. const ScopArrayInfo *SAI) {
  161. assert(SAI->isValueKind() && "this function only deals with scalars."
  162. " Dealing with arrays required alias analysis");
  163. const Region &R = S.getRegion();
  164. for (User *U : SAI->getBasePtr()->users()) {
  165. Instruction *I = dyn_cast<Instruction>(U);
  166. assert(I && "invalid user of scop array info");
  167. if (!R.contains(I))
  168. return false;
  169. }
  170. return true;
  171. }
  172. /// Compute must-kills needed to enable live range reordering with PPCG.
  173. ///
  174. /// @param S The Scop to compute live range reordering information for.
  175. /// @returns Live range reordering information that can be used to set up
  176. /// PPCG.
  177. static MustKillsInfo computeMustKillsInfo(const Scop &S) {
  178. const isl::space ParamSpace = S.getParamSpace();
  179. MustKillsInfo Info;
  180. // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria:
  181. // 1.1 phi nodes in scop.
  182. // 1.2 scalars that are only used within the scop
  183. SmallVector<isl::id, 4> KillMemIds;
  184. for (ScopArrayInfo *SAI : S.arrays()) {
  185. if (SAI->isPHIKind() ||
  186. (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
  187. KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release()));
  188. }
  189. Info.TaggedMustKills = isl::union_map::empty(ParamSpace.ctx());
  190. Info.MustKills = isl::union_map::empty(ParamSpace.ctx());
  191. // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
  192. // schedule:
  193. // - filter: "[control] -> { }"
  194. // So, we choose to not create this to keep the output a little nicer,
  195. // at the cost of some code complexity.
  196. Info.KillsSchedule = {};
  197. for (isl::id &ToKillId : KillMemIds) {
  198. isl::id KillStmtId = isl::id::alloc(
  199. S.getIslCtx(),
  200. std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr);
  201. // NOTE: construction of tagged_must_kill:
  202. // 2. We need to construct a map:
  203. // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
  204. // To construct this, we use `isl_map_domain_product` on two maps:
  205. // 2a. StmtToScalar:
  206. // [param] -> { Stmt_phantom[] -> scalar_to_kill[] }
  207. // 2b. PhantomRefToScalar:
  208. // [param] -> { ref_phantom[] -> scalar_to_kill[] }
  209. //
  210. // Combining these with `isl_map_domain_product` gives us
  211. // TaggedMustKill:
  212. // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
  213. // 2a. [param] -> { Stmt[] -> scalar_to_kill[] }
  214. isl::map StmtToScalar = isl::map::universe(ParamSpace);
  215. StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
  216. StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId));
  217. isl::id PhantomRefId = isl::id::alloc(
  218. S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(),
  219. nullptr);
  220. // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] }
  221. isl::map PhantomRefToScalar = isl::map::universe(ParamSpace);
  222. PhantomRefToScalar =
  223. PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId);
  224. PhantomRefToScalar =
  225. PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId);
  226. // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
  227. isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
  228. Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
  229. // 2. [param] -> { Stmt[] -> scalar_to_kill[] }
  230. Info.MustKills = Info.TaggedMustKills.domain_factor_domain();
  231. // 3. Create the kill schedule of the form:
  232. // "[param] -> { Stmt_phantom[] }"
  233. // Then add this to Info.KillsSchedule.
  234. isl::space KillStmtSpace = ParamSpace;
  235. KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
  236. isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
  237. isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
  238. if (!Info.KillsSchedule.is_null())
  239. Info.KillsSchedule = isl::manage(
  240. isl_schedule_set(Info.KillsSchedule.release(), KillSchedule.copy()));
  241. else
  242. Info.KillsSchedule = KillSchedule;
  243. }
  244. return Info;
  245. }
  246. /// Create the ast expressions for a ScopStmt.
  247. ///
  248. /// This function is a callback used to generate the ast expressions for each
  249. /// of the scheduled ScopStmts.
  250. static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
  251. void *StmtT, __isl_take isl_ast_build *Build_C,
  252. isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
  253. isl_id *Id, void *User),
  254. void *UserIndex,
  255. isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
  256. void *UserExpr) {
  257. ScopStmt *Stmt = (ScopStmt *)StmtT;
  258. if (!Stmt || !Build_C)
  259. return NULL;
  260. isl::ast_build Build = isl::manage_copy(Build_C);
  261. isl::ctx Ctx = Build.ctx();
  262. isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0);
  263. Stmt->setAstBuild(Build);
  264. for (MemoryAccess *Acc : *Stmt) {
  265. isl::map AddrFunc = Acc->getAddressFunction();
  266. AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain());
  267. isl::id RefId = Acc->getId();
  268. isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc);
  269. isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA);
  270. MPA = MPA.coalesce();
  271. MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex));
  272. isl::ast_expr Access = Build.access_from(MPA);
  273. Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr));
  274. RefToExpr = RefToExpr.set(RefId, Access);
  275. }
  276. return RefToExpr.release();
  277. }
  278. /// Given an LLVM Type, compute its size in bytes.
  279. static int computeSizeInBytes(const Type *T) {
  280. int bytes = T->getPrimitiveSizeInBits() / 8;
  281. if (bytes == 0)
  282. bytes = T->getScalarSizeInBits() / 8;
  283. return bytes;
  284. }
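// For example (illustrative), a double yields 8 and a <4 x float> vector yields
// 16 via getPrimitiveSizeInBits(); the getScalarSizeInBits() fallback handles
// types whose primitive size is reported as 0.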
  285. /// Generate code for a GPU specific isl AST.
  286. ///
  287. /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
  288. /// generates code for general-purpose AST nodes, with special functionality
  289. /// for generating GPU specific user nodes.
  290. ///
  291. /// @see GPUNodeBuilder::createUser
  292. class GPUNodeBuilder final : public IslNodeBuilder {
  293. public:
  294. GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
  295. const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
  296. DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
  297. gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
  298. : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
  299. Prog(Prog), Runtime(Runtime), Arch(Arch) {
  300. getExprBuilder().setIDToSAI(&IDToSAI);
  301. }
  302. /// Create after-run-time-check initialization code.
  303. void initializeAfterRTH();
  304. /// Finalize the generated scop.
  305. void finalize() override;
  306. /// Track if the full build process was successful.
  307. ///
  308. /// This value is set to false if, at any point during the build process, an
  309. /// error occurred that prevents us from generating valid GPU code.
  310. bool BuildSuccessful = true;
  311. /// The maximal number of loops surrounding a sequential kernel.
  312. unsigned DeepestSequential = 0;
  313. /// The maximal number of loops surrounding a parallel kernel.
  314. unsigned DeepestParallel = 0;
  315. /// Return the name to set for the ptx_kernel.
  316. std::string getKernelFuncName(int Kernel_id);
  317. private:
  318. /// A vector of array base pointers for which a new ScopArrayInfo was created.
  319. ///
  320. /// This vector is used to delete the ScopArrayInfo when it is not needed any
  321. /// more.
  322. std::vector<Value *> LocalArrays;
  323. /// A map from ScopArrays to their corresponding device allocations.
  324. std::map<ScopArrayInfo *, Value *> DeviceAllocations;
  325. /// The current GPU context.
  326. Value *GPUContext;
  327. /// The set of isl_ids allocated in the kernel
  328. std::vector<isl_id *> KernelIds;
  329. /// A module containing GPU code.
  330. ///
  331. /// This pointer is only set in case we are currently generating GPU code.
  332. std::unique_ptr<Module> GPUModule;
  333. /// The GPU program we generate code for.
  334. gpu_prog *Prog;
  335. /// The GPU Runtime implementation to use (OpenCL or CUDA).
  336. GPURuntime Runtime;
  337. /// The GPU Architecture to target.
  338. GPUArch Arch;
  339. /// Class to free isl_ids.
  340. class IslIdDeleter final {
  341. public:
  342. void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  343. };
  344. /// A set containing all isl_ids allocated in a GPU kernel.
  345. ///
  346. /// By releasing this set all isl_ids will be freed.
  347. std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
  348. IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
  349. /// Create code for user-defined AST nodes.
  350. ///
  351. /// These AST nodes can be of type:
  352. ///
  353. /// - ScopStmt: A computational statement (TODO)
  354. /// - Kernel: A GPU kernel call (TODO)
  355. /// - Data-Transfer: A GPU <-> CPU data-transfer
  356. /// - In-kernel synchronization
  357. /// - In-kernel memory copy statement
  358. ///
  359. /// @param UserStmt The ast node to generate code for.
  360. void createUser(__isl_take isl_ast_node *UserStmt) override;
  361. void createFor(__isl_take isl_ast_node *Node) override;
  362. enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
  363. /// Create code for a data transfer statement
  364. ///
  365. /// @param TransferStmt The data transfer statement.
  366. /// @param Direction The direction in which to transfer data.
  367. void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
  368. enum DataDirection Direction);
  369. /// Find llvm::Values referenced in GPU kernel.
  370. ///
  371. /// @param Kernel The kernel to scan for llvm::Values
  372. ///
  373. /// @returns A tuple, whose:
  374. /// - First element contains the set of values referenced by the
  375. /// kernel
  376. /// - Second element contains the set of functions referenced by the
  377. /// kernel. All functions in the set satisfy
  378. /// `isValidFunctionInKernel`.
  379. /// - Third element contains loops that have induction variables
  380. /// which are used in the kernel, *and* these loops are *neither*
  381. /// in the scop, nor do they immediately surround the Scop.
  382. /// See [Code generation of induction variables of loops outside
  383. /// Scops]
  384. std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
  385. isl::space>
  386. getReferencesInKernel(ppcg_kernel *Kernel);
  387. /// Compute the sizes of the execution grid for a given kernel.
  388. ///
  389. /// @param Kernel The kernel to compute grid sizes for.
  390. ///
  391. /// @returns A tuple with grid sizes for the X and Y dimensions.
  392. std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
  393. /// Get the managed array pointer for sending host pointers to the device.
  394. /// \note
  395. /// This is to be used only with managed memory
  396. Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
  397. /// Compute the sizes of the thread blocks for a given kernel.
  398. ///
  399. /// @param Kernel The kernel to compute thread block sizes for.
  400. ///
  401. /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
  402. std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
  403. /// Store a specific kernel launch parameter in the array of kernel launch
  404. /// parameters.
  405. ///
  406. /// @param ArrayTy Array type of \p Parameters.
  407. /// @param Parameters The list of parameters in which to store.
  408. /// @param Param The kernel launch parameter to store.
  409. /// @param Index The index in the parameter list, at which to store the
  410. /// parameter.
  411. void insertStoreParameter(Type *ArrayTy, Instruction *Parameters,
  412. Instruction *Param, int Index);
  413. /// Create kernel launch parameters.
  414. ///
  415. /// @param Kernel The kernel to create parameters for.
  416. /// @param F The kernel function that has been created.
  417. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  418. ///
  419. /// @returns A stack allocated array with pointers to the parameter
  420. /// values that are passed to the kernel.
  421. Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
  422. SetVector<Value *> SubtreeValues);
  423. /// Create declarations for kernel variables.
  424. ///
  425. /// This includes shared memory declarations.
  426. ///
  427. /// @param Kernel The kernel definition to create variables for.
  428. /// @param FN The function into which to generate the variables.
  429. void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
  430. /// Add CUDA annotations to module.
  431. ///
  432. /// Add a set of CUDA annotations that declares the maximal block dimensions
  433. /// that will be used to execute the CUDA kernel. This allows the NVIDIA
  434. /// PTX compiler to bound the number of allocated registers to ensure the
  435. /// resulting kernel is known to run with up to as many block dimensions
  436. /// as specified here.
  437. ///
  438. /// @param M The module to add the annotations to.
  439. /// @param BlockDimX The size of block dimension X.
  440. /// @param BlockDimY The size of block dimension Y.
  441. /// @param BlockDimZ The size of block dimension Z.
  442. void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
  443. Value *BlockDimZ);
  444. /// Create GPU kernel.
  445. ///
  446. /// Code generate the kernel described by @p KernelStmt.
  447. ///
  448. /// @param KernelStmt The ast node to generate kernel code for.
  449. void createKernel(__isl_take isl_ast_node *KernelStmt);
  450. /// Generate code that computes the size of an array.
  451. ///
  452. /// @param Array The array for which to compute a size.
  453. Value *getArraySize(gpu_array_info *Array);
  454. /// Generate code to compute the minimal offset at which an array is accessed.
  455. ///
  456. /// The offset of an array is the minimal array location accessed in a scop.
  457. ///
  458. /// Example:
  459. ///
  460. /// for (long i = 0; i < 100; i++)
  461. /// A[i + 42] += ...
  462. ///
  463. /// getArrayOffset(A) results in 42.
  464. ///
  465. /// @param Array The array for which to compute the offset.
  466. /// @returns An llvm::Value that contains the offset of the array.
  467. Value *getArrayOffset(gpu_array_info *Array);
  468. /// Prepare the kernel arguments for kernel code generation
  469. ///
  470. /// @param Kernel The kernel to generate code for.
  471. /// @param FN The function created for the kernel.
  472. void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);
  473. /// Create kernel function.
  474. ///
  475. /// Create a kernel function located in a newly created module that can serve
  476. /// as target for device code generation. Set the Builder to point to the
  477. /// start block of this newly created function.
  478. ///
  479. /// @param Kernel The kernel to generate code for.
  480. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  481. /// @param SubtreeFunctions The set of llvm::Functions referenced by this
  482. /// kernel.
  483. void createKernelFunction(ppcg_kernel *Kernel,
  484. SetVector<Value *> &SubtreeValues,
  485. SetVector<Function *> &SubtreeFunctions);
  486. /// Create the declaration of a kernel function.
  487. ///
  488. /// The kernel function takes as arguments:
  489. ///
  490. /// - One i8 pointer for each external array reference used in the kernel.
  491. /// - Host iterators
  492. /// - Parameters
  493. /// - Other LLVM Value references (TODO)
  494. ///
  495. /// @param Kernel The kernel to generate the function declaration for.
  496. /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  497. ///
  498. /// @returns The newly declared function.
  499. Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
  500. SetVector<Value *> &SubtreeValues);
  501. /// Insert intrinsic functions to obtain thread and block ids.
  502. ///
  503. /// @param Kernel The kernel to generate the intrinsic functions for.
  504. void insertKernelIntrinsics(ppcg_kernel *Kernel);
  505. /// Insert function calls to retrieve the SPIR group/local ids.
  506. ///
  507. /// @param Kernel The kernel to generate the function calls for.
  508. /// @param SizeTypeIs64bit Whether size_t of the OpenCL device is 64-bit.
  509. void insertKernelCallsSPIR(ppcg_kernel *Kernel, bool SizeTypeIs64bit);
  510. /// Setup the creation of functions referenced by the GPU kernel.
  511. ///
  512. /// 1. Create new function declarations in GPUModule which are the same as
  513. /// SubtreeFunctions.
  514. ///
  515. /// 2. Populate IslNodeBuilder::ValueMap with mappings from
  516. /// old functions (that come from the original module) to new functions
  517. /// (that are created within GPUModule). That way, we generate references
  518. /// to the correct function (in GPUModule) in BlockGenerator.
  519. ///
  520. /// @see IslNodeBuilder::ValueMap
  521. /// @see BlockGenerator::GlobalMap
  522. /// @see BlockGenerator::getNewValue
  523. /// @see GPUNodeBuilder::getReferencesInKernel.
  524. ///
  525. /// @param SubtreeFunctions The set of llvm::Functions referenced by
  526. /// this kernel.
  527. void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
  528. /// Create a global-to-shared or shared-to-global copy statement.
  529. ///
  530. /// @param CopyStmt The copy statement to generate code for
  531. void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
  532. /// Create code for a ScopStmt called in @p Expr.
  533. ///
  534. /// @param Expr The expression containing the call.
  535. /// @param KernelStmt The kernel statement referenced in the call.
  536. void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
  537. /// Create an in-kernel synchronization call.
  538. void createKernelSync();
  539. /// Create a PTX assembly string for the current GPU kernel.
  540. ///
  541. /// @returns A string containing the corresponding PTX assembly code.
  542. std::string createKernelASM();
  543. /// Remove references from the dominator tree to the kernel function @p F.
  544. ///
  545. /// @param F The function to remove references to.
  546. void clearDominators(Function *F);
  547. /// Remove references from scalar evolution to the kernel function @p F.
  548. ///
  549. /// @param F The function to remove references to.
  550. void clearScalarEvolution(Function *F);
  551. /// Remove references from loop info to the kernel function @p F.
  552. ///
  553. /// @param F The function to remove references to.
  554. void clearLoops(Function *F);
  555. /// Check if the scop requires to be linked with CUDA's libdevice.
  556. bool requiresCUDALibDevice();
  557. /// Link with the NVIDIA libdevice library (if needed and available).
  558. void addCUDALibDevice();
  559. /// Finalize the generation of the kernel function.
  560. ///
  561. /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  562. /// dump its IR to stderr.
  563. ///
  564. /// @returns The Assembly string of the kernel.
  565. std::string finalizeKernelFunction();
  566. /// Finalize the generation of the kernel arguments.
  567. ///
  568. /// This function ensures that not-read-only scalars used in a kernel are
  569. /// stored back to the global memory location they are backed with before
  570. /// the kernel terminates.
  571. ///
  572. /// @param Kernel The kernel to finalize kernel arguments for.
  573. void finalizeKernelArguments(ppcg_kernel *Kernel);
  574. /// Create code that allocates memory to store arrays on device.
  575. void allocateDeviceArrays();
  576. /// Create code to prepare the managed device pointers.
  577. void prepareManagedDeviceArrays();
  578. /// Free all allocated device arrays.
  579. void freeDeviceArrays();
  580. /// Create a call to initialize the GPU context.
  581. ///
  582. /// @returns A pointer to the newly initialized context.
  583. Value *createCallInitContext();
  584. /// Create a call to get the device pointer for a kernel allocation.
  585. ///
  586. /// @param Allocation The Polly GPU allocation
  587. ///
  588. /// @returns The device parameter corresponding to this allocation.
  589. Value *createCallGetDevicePtr(Value *Allocation);
  590. /// Create a call to free the GPU context.
  591. ///
  592. /// @param Context A pointer to an initialized GPU context.
  593. void createCallFreeContext(Value *Context);
  594. /// Create a call to allocate memory on the device.
  595. ///
  596. /// @param Size The size of memory to allocate
  597. ///
  598. /// @returns A pointer that identifies this allocation.
  599. Value *createCallAllocateMemoryForDevice(Value *Size);
  600. /// Create a call to free a device array.
  601. ///
  602. /// @param Array The device array to free.
  603. void createCallFreeDeviceMemory(Value *Array);
  604. /// Create a call to copy data from host to device.
  605. ///
  606. /// @param HostPtr A pointer to the host data that should be copied.
  607. /// @param DevicePtr A device pointer specifying the location to copy to.
  608. void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
  609. Value *Size);
  610. /// Create a call to copy data from device to host.
  611. ///
  612. /// @param DevicePtr A pointer to the device data that should be copied.
  613. /// @param HostPtr A host pointer specifying the location to copy to.
  614. void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
  615. Value *Size);
  616. /// Create a call to synchronize Host & Device.
  617. /// \note
  618. /// This is to be used only with managed memory.
  619. void createCallSynchronizeDevice();
  620. /// Create a call to get a kernel from an assembly string.
  621. ///
  622. /// @param Buffer The string describing the kernel.
  623. /// @param Entry The name of the kernel function to call.
  624. ///
  625. /// @returns A pointer to a kernel object
  626. Value *createCallGetKernel(Value *Buffer, Value *Entry);
  627. /// Create a call to free a GPU kernel.
  628. ///
  629. /// @param GPUKernel The kernel to free.
  630. void createCallFreeKernel(Value *GPUKernel);
  631. /// Create a call to launch a GPU kernel.
  632. ///
  633. /// @param GPUKernel The kernel to launch.
  634. /// @param GridDimX The size of the first grid dimension.
  635. /// @param GridDimY The size of the second grid dimension.
  636. /// @param BlockDimX The size of the first block dimension.
  637. /// @param BlockDimY The size of the second block dimension.
  638. /// @param BlockDimZ The size of the third block dimension.
  639. /// @param Parameters A pointer to an array that itself contains pointers to
  640. /// the parameter values passed for each kernel argument.
  641. void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
  642. Value *GridDimY, Value *BlockDimX,
  643. Value *BlockDimY, Value *BlockDimZ,
  644. Value *Parameters);
  645. };
  646. std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) {
  647. return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" +
  648. std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id);
  649. }
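// Illustrative example (names invented): for a function "gemm" whose scop has
// ID 0, the first kernel would be named "FUNC_gemm_SCOP_0_KERNEL_0".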
  650. void GPUNodeBuilder::initializeAfterRTH() {
  651. BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
  652. &*Builder.GetInsertPoint(), &DT, &LI);
  653. NewBB->setName("polly.acc.initialize");
  654. Builder.SetInsertPoint(&NewBB->front());
  655. GPUContext = createCallInitContext();
  656. if (!PollyManagedMemory)
  657. allocateDeviceArrays();
  658. else
  659. prepareManagedDeviceArrays();
  660. }
  661. void GPUNodeBuilder::finalize() {
  662. if (!PollyManagedMemory)
  663. freeDeviceArrays();
  664. createCallFreeContext(GPUContext);
  665. IslNodeBuilder::finalize();
  666. }
  667. void GPUNodeBuilder::allocateDeviceArrays() {
  668. assert(!PollyManagedMemory &&
  669. "Managed memory will directly send host pointers "
  670. "to the kernel. There is no need for device arrays");
  671. isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release());
  672. for (int i = 0; i < Prog->n_array; ++i) {
  673. gpu_array_info *Array = &Prog->array[i];
  674. auto *ScopArray = (ScopArrayInfo *)Array->user;
  675. std::string DevArrayName("p_dev_array_");
  676. DevArrayName.append(Array->name);
  677. Value *ArraySize = getArraySize(Array);
  678. Value *Offset = getArrayOffset(Array);
  679. if (Offset)
  680. ArraySize = Builder.CreateSub(
  681. ArraySize,
  682. Builder.CreateMul(Offset,
  683. Builder.getInt64(ScopArray->getElemSizeInBytes())));
  684. const SCEV *SizeSCEV = SE.getSCEV(ArraySize);
  685. // It makes no sense to have an array of size 0. The CUDA API will
  686. // throw an error anyway if we invoke `cuMallocManaged` with size `0`. We
  687. // choose to be defensive and catch this at the compile phase. It is
  688. // most likely that we are doing something wrong with size computation.
  689. if (SizeSCEV->isZero()) {
  690. errs() << getUniqueScopName(&S)
  691. << " has computed array size 0: " << *ArraySize
  692. << " | for array: " << *(ScopArray->getBasePtr())
  693. << ". This is illegal, exiting.\n";
  694. report_fatal_error("array size was computed to be 0");
  695. }
  696. Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
  697. DevArray->setName(DevArrayName);
  698. DeviceAllocations[ScopArray] = DevArray;
  699. }
  700. isl_ast_build_free(Build);
  701. }
  702. void GPUNodeBuilder::prepareManagedDeviceArrays() {
  703. assert(PollyManagedMemory &&
  704. "Device array most only be prepared in managed-memory mode");
  705. for (int i = 0; i < Prog->n_array; ++i) {
  706. gpu_array_info *Array = &Prog->array[i];
  707. ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
  708. Value *HostPtr;
  709. if (gpu_array_is_scalar(Array))
  710. HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  711. else
  712. HostPtr = ScopArray->getBasePtr();
  713. HostPtr = getLatestValue(HostPtr);
  714. Value *Offset = getArrayOffset(Array);
  715. if (Offset) {
  716. HostPtr = Builder.CreatePointerCast(
  717. HostPtr, ScopArray->getElementType()->getPointerTo());
  718. HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset);
  719. }
  720. HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
  721. DeviceAllocations[ScopArray] = HostPtr;
  722. }
  723. }
  724. void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
  725. Value *BlockDimY, Value *BlockDimZ) {
  726. auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
  727. for (auto &F : *M) {
  728. if (F.getCallingConv() != CallingConv::PTX_Kernel)
  729. continue;
  730. Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
  731. Metadata *Elements[] = {
  732. ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"),
  733. ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
  734. ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
  735. ValueAsMetadata::get(V[2]),
  736. };
  737. MDNode *Node = MDNode::get(M->getContext(), Elements);
  738. AnnotationNode->addOperand(Node);
  739. }
  740. }
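// A rough sketch of the metadata this emits (operand values are illustrative):
//   !nvvm.annotations = !{!0}
//   !0 = !{<kernel function>, !"maxntidx", i32 32,
//          !"maxntidy", i32 4, !"maxntidz", i32 1}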
  741. void GPUNodeBuilder::freeDeviceArrays() {
  742. assert(!PollyManagedMemory && "Managed memory does not use device arrays");
  743. for (auto &Array : DeviceAllocations)
  744. createCallFreeDeviceMemory(Array.second);
  745. }
  746. Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  747. const char *Name = "polly_getKernel";
  748. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  749. Function *F = M->getFunction(Name);
  750. // If F is not available, declare it.
  751. if (!F) {
  752. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  753. std::vector<Type *> Args;
  754. Args.push_back(Builder.getInt8PtrTy());
  755. Args.push_back(Builder.getInt8PtrTy());
  756. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  757. F = Function::Create(Ty, Linkage, Name, M);
  758. }
  759. return Builder.CreateCall(F, {Buffer, Entry});
  760. }
  761. Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  762. const char *Name = "polly_getDevicePtr";
  763. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  764. Function *F = M->getFunction(Name);
  765. // If F is not available, declare it.
  766. if (!F) {
  767. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  768. std::vector<Type *> Args;
  769. Args.push_back(Builder.getInt8PtrTy());
  770. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  771. F = Function::Create(Ty, Linkage, Name, M);
  772. }
  773. return Builder.CreateCall(F, {Allocation});
  774. }
  775. void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
  776. Value *GridDimY, Value *BlockDimX,
  777. Value *BlockDimY, Value *BlockDimZ,
  778. Value *Parameters) {
  779. const char *Name = "polly_launchKernel";
  780. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  781. Function *F = M->getFunction(Name);
  782. // If F is not available, declare it.
  783. if (!F) {
  784. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  785. std::vector<Type *> Args;
  786. Args.push_back(Builder.getInt8PtrTy());
  787. Args.push_back(Builder.getInt32Ty());
  788. Args.push_back(Builder.getInt32Ty());
  789. Args.push_back(Builder.getInt32Ty());
  790. Args.push_back(Builder.getInt32Ty());
  791. Args.push_back(Builder.getInt32Ty());
  792. Args.push_back(Builder.getInt8PtrTy());
  793. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  794. F = Function::Create(Ty, Linkage, Name, M);
  795. }
  796. Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
  797. BlockDimZ, Parameters});
  798. }
  799. void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  800. const char *Name = "polly_freeKernel";
  801. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  802. Function *F = M->getFunction(Name);
  803. // If F is not available, declare it.
  804. if (!F) {
  805. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  806. std::vector<Type *> Args;
  807. Args.push_back(Builder.getInt8PtrTy());
  808. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  809. F = Function::Create(Ty, Linkage, Name, M);
  810. }
  811. Builder.CreateCall(F, {GPUKernel});
  812. }
  813. void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  814. assert(!PollyManagedMemory &&
  815. "Managed memory does not allocate or free memory "
  816. "for device");
  817. const char *Name = "polly_freeDeviceMemory";
  818. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  819. Function *F = M->getFunction(Name);
  820. // If F is not available, declare it.
  821. if (!F) {
  822. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  823. std::vector<Type *> Args;
  824. Args.push_back(Builder.getInt8PtrTy());
  825. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  826. F = Function::Create(Ty, Linkage, Name, M);
  827. }
  828. Builder.CreateCall(F, {Array});
  829. }
  830. Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  831. assert(!PollyManagedMemory &&
  832. "Managed memory does not allocate or free memory "
  833. "for device");
  834. const char *Name = "polly_allocateMemoryForDevice";
  835. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  836. Function *F = M->getFunction(Name);
  837. // If F is not available, declare it.
  838. if (!F) {
  839. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  840. std::vector<Type *> Args;
  841. Args.push_back(Builder.getInt64Ty());
  842. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  843. F = Function::Create(Ty, Linkage, Name, M);
  844. }
  845. return Builder.CreateCall(F, {Size});
  846. }
  847. void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
  848. Value *DeviceData,
  849. Value *Size) {
  850. assert(!PollyManagedMemory &&
  851. "Managed memory does not transfer memory between "
  852. "device and host");
  853. const char *Name = "polly_copyFromHostToDevice";
  854. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  855. Function *F = M->getFunction(Name);
  856. // If F is not available, declare it.
  857. if (!F) {
  858. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  859. std::vector<Type *> Args;
  860. Args.push_back(Builder.getInt8PtrTy());
  861. Args.push_back(Builder.getInt8PtrTy());
  862. Args.push_back(Builder.getInt64Ty());
  863. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  864. F = Function::Create(Ty, Linkage, Name, M);
  865. }
  866. Builder.CreateCall(F, {HostData, DeviceData, Size});
  867. }
  868. void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
  869. Value *HostData,
  870. Value *Size) {
  871. assert(!PollyManagedMemory &&
  872. "Managed memory does not transfer memory between "
  873. "device and host");
  874. const char *Name = "polly_copyFromDeviceToHost";
  875. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  876. Function *F = M->getFunction(Name);
  877. // If F is not available, declare it.
  878. if (!F) {
  879. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  880. std::vector<Type *> Args;
  881. Args.push_back(Builder.getInt8PtrTy());
  882. Args.push_back(Builder.getInt8PtrTy());
  883. Args.push_back(Builder.getInt64Ty());
  884. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  885. F = Function::Create(Ty, Linkage, Name, M);
  886. }
  887. Builder.CreateCall(F, {DeviceData, HostData, Size});
  888. }
  889. void GPUNodeBuilder::createCallSynchronizeDevice() {
  890. assert(PollyManagedMemory && "explicit synchronization is only necessary for "
  891. "managed memory");
  892. const char *Name = "polly_synchronizeDevice";
  893. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  894. Function *F = M->getFunction(Name);
  895. // If F is not available, declare it.
  896. if (!F) {
  897. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  898. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
  899. F = Function::Create(Ty, Linkage, Name, M);
  900. }
  901. Builder.CreateCall(F);
  902. }
  903. Value *GPUNodeBuilder::createCallInitContext() {
  904. const char *Name;
  905. switch (Runtime) {
  906. case GPURuntime::CUDA:
  907. Name = "polly_initContextCUDA";
  908. break;
  909. case GPURuntime::OpenCL:
  910. Name = "polly_initContextCL";
  911. break;
  912. }
  913. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  914. Function *F = M->getFunction(Name);
  915. // If F is not available, declare it.
  916. if (!F) {
  917. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  918. std::vector<Type *> Args;
  919. FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
  920. F = Function::Create(Ty, Linkage, Name, M);
  921. }
  922. return Builder.CreateCall(F, {});
  923. }
  924. void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  925. const char *Name = "polly_freeContext";
  926. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  927. Function *F = M->getFunction(Name);
  928. // If F is not available, declare it.
  929. if (!F) {
  930. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  931. std::vector<Type *> Args;
  932. Args.push_back(Builder.getInt8PtrTy());
  933. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  934. F = Function::Create(Ty, Linkage, Name, M);
  935. }
  936. Builder.CreateCall(F, {Context});
  937. }
  938. /// Check if one string is a prefix of another.
  939. ///
  940. /// @param String The string in which to look for the prefix.
  941. /// @param Prefix The prefix to look for.
  942. static bool isPrefix(std::string String, std::string Prefix) {
  943. return String.find(Prefix) == 0;
  944. }
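/// Compute the total size of @p Array in bytes as an llvm::Value.
///
/// The per-element size (Array->size) is multiplied by the product of all
/// dimension bounds; for scalars only the element size is returned. As a
/// rough illustration, for a hypothetical array `double A[N][M]` this
/// yields `8 * (N * M)` bytes.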
  945. Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  946. isl::ast_build Build = isl::ast_build::from_context(S.getContext());
  947. Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);
  948. if (!gpu_array_is_scalar(Array)) {
  949. isl::multi_pw_aff ArrayBound = isl::manage_copy(Array->bound);
  950. isl::pw_aff OffsetDimZero = ArrayBound.at(0);
  951. isl::ast_expr Res = Build.expr_from(OffsetDimZero);
  952. for (unsigned int i = 1; i < Array->n_index; i++) {
  953. isl::pw_aff Bound_I = ArrayBound.at(i);
  954. isl::ast_expr Expr = Build.expr_from(Bound_I);
  955. Res = Res.mul(Expr);
  956. }
  957. Value *NumElements = ExprBuilder.create(Res.release());
  958. if (NumElements->getType() != ArraySize->getType())
  959. NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
  960. ArraySize = Builder.CreateMul(ArraySize, NumElements);
  961. }
  962. return ArraySize;
  963. }
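/// Compute the offset, in number of elements, of the lexicographically
/// smallest accessed array element, or return nullptr if the accessed
/// extent already starts at the origin.
///
/// The per-dimension minima are linearized using the array bounds: for a
/// hypothetical 2D extent starting at A[2][3] with inner bound M, the
/// result is roughly `2 * M + 3`.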
  964. Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
  965. if (gpu_array_is_scalar(Array))
  966. return nullptr;
  967. isl::ast_build Build = isl::ast_build::from_context(S.getContext());
  968. isl::set Min = isl::manage_copy(Array->extent).lexmin();
  969. isl::set ZeroSet = isl::set::universe(Min.get_space());
  970. for (unsigned i : rangeIslSize(0, Min.tuple_dim()))
  971. ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0);
  972. if (Min.is_subset(ZeroSet)) {
  973. return nullptr;
  974. }
  975. isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0));
  976. for (unsigned i : rangeIslSize(0, Min.tuple_dim())) {
  977. if (i > 0) {
  978. isl::pw_aff Bound_I =
  979. isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1));
  980. isl::ast_expr BExpr = Build.expr_from(Bound_I);
  981. Result = Result.mul(BExpr);
  982. }
  983. isl::pw_aff DimMin = Min.dim_min(i);
  984. isl::ast_expr MExpr = Build.expr_from(DimMin);
  985. Result = Result.add(MExpr);
  986. }
  987. return ExprBuilder.create(Result.release());
  988. }
  989. Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
  990. ScopArrayInfo *ArrayInfo) {
  991. assert(PollyManagedMemory && "Only used when you wish to get a host "
  992. "pointer for sending data to the kernel, "
  993. "with managed memory");
  994. std::map<ScopArrayInfo *, Value *>::iterator it;
  995. it = DeviceAllocations.find(ArrayInfo);
  996. assert(it != DeviceAllocations.end() &&
  997. "Device array expected to be available");
  998. return it->second;
  999. }
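// createDataTransfer copies a single array between host and device. When only
// a sub-range of the array is accessed (Offset != nullptr), the host pointer
// is advanced by Offset elements and the byte count is shrunk by
// Offset * element-size, so only the accessed part of the array is copied.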
  1000. void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
  1001. enum DataDirection Direction) {
  1002. assert(!PollyManagedMemory && "Managed memory needs no data transfers");
  1003. isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  1004. isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  1005. isl_id *Id = isl_ast_expr_get_id(Arg);
  1006. auto Array = (gpu_array_info *)isl_id_get_user(Id);
  1007. auto ScopArray = (ScopArrayInfo *)(Array->user);
  1008. Value *Size = getArraySize(Array);
  1009. Value *Offset = getArrayOffset(Array);
  1010. Value *DevPtr = DeviceAllocations[ScopArray];
  1011. Value *HostPtr;
  1012. if (gpu_array_is_scalar(Array))
  1013. HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  1014. else
  1015. HostPtr = ScopArray->getBasePtr();
  1016. HostPtr = getLatestValue(HostPtr);
  1017. if (Offset) {
  1018. HostPtr = Builder.CreatePointerCast(
  1019. HostPtr, ScopArray->getElementType()->getPointerTo());
  1020. HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset);
  1021. }
  1022. HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
  1023. if (Offset) {
  1024. Size = Builder.CreateSub(
  1025. Size, Builder.CreateMul(
  1026. Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
  1027. }
  1028. if (Direction == HOST_TO_DEVICE)
  1029. createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  1030. else
  1031. createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);
  1032. isl_id_free(Id);
  1033. isl_ast_expr_free(Arg);
  1034. isl_ast_expr_free(Expr);
  1035. isl_ast_node_free(TransferStmt);
  1036. }
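// createUser dispatches an AST user node based on the name of its statement
// id: "kernel" runs createKernel (plus a device synchronization when managed
// memory is used), "init_device" and "clear_device" call initializeAfterRTH()
// and finalize(), "to_device_*" and "from_device_*" become host<->device
// copies (no-ops with managed memory), and everything else is a
// kernel-internal statement handled through its ppcg_kernel_stmt annotation
// (domain, copy or sync).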
  1037. void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  1038. isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  1039. isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  1040. isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  1041. isl_id_free(Id);
  1042. isl_ast_expr_free(StmtExpr);
  1043. const char *Str = isl_id_get_name(Id);
  1044. if (!strcmp(Str, "kernel")) {
  1045. createKernel(UserStmt);
  1046. if (PollyManagedMemory)
  1047. createCallSynchronizeDevice();
  1048. isl_ast_expr_free(Expr);
  1049. return;
  1050. }
  1051. if (!strcmp(Str, "init_device")) {
  1052. initializeAfterRTH();
  1053. isl_ast_node_free(UserStmt);
  1054. isl_ast_expr_free(Expr);
  1055. return;
  1056. }
  1057. if (!strcmp(Str, "clear_device")) {
  1058. finalize();
  1059. isl_ast_node_free(UserStmt);
  1060. isl_ast_expr_free(Expr);
  1061. return;
  1062. }
  1063. if (isPrefix(Str, "to_device")) {
  1064. if (!PollyManagedMemory)
  1065. createDataTransfer(UserStmt, HOST_TO_DEVICE);
  1066. else
  1067. isl_ast_node_free(UserStmt);
  1068. isl_ast_expr_free(Expr);
  1069. return;
  1070. }
  1071. if (isPrefix(Str, "from_device")) {
  1072. if (!PollyManagedMemory) {
  1073. createDataTransfer(UserStmt, DEVICE_TO_HOST);
  1074. } else {
  1075. isl_ast_node_free(UserStmt);
  1076. }
  1077. isl_ast_expr_free(Expr);
  1078. return;
  1079. }
  1080. isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  1081. struct ppcg_kernel_stmt *KernelStmt =
  1082. (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  1083. isl_id_free(Anno);
  1084. switch (KernelStmt->type) {
  1085. case ppcg_kernel_domain:
  1086. createScopStmt(Expr, KernelStmt);
  1087. isl_ast_node_free(UserStmt);
  1088. return;
  1089. case ppcg_kernel_copy:
  1090. createKernelCopy(KernelStmt);
  1091. isl_ast_expr_free(Expr);
  1092. isl_ast_node_free(UserStmt);
  1093. return;
  1094. case ppcg_kernel_sync:
  1095. createKernelSync();
  1096. isl_ast_expr_free(Expr);
  1097. isl_ast_node_free(UserStmt);
  1098. return;
  1099. }
  1100. isl_ast_expr_free(Expr);
  1101. isl_ast_node_free(UserStmt);
  1102. }
  1103. void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) {
  1104. createForSequential(isl::manage(Node).as<isl::ast_node_for>(), false);
  1105. }
  1106. void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  1107. isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  1108. auto LocalAddr = ExprBuilder.createAccessAddress(LocalIndex);
  1109. isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  1110. auto GlobalAddr = ExprBuilder.createAccessAddress(Index);
  1111. if (KernelStmt->u.c.read) {
  1112. LoadInst *Load =
  1113. Builder.CreateLoad(GlobalAddr.second, GlobalAddr.first, "shared.read");
  1114. Builder.CreateStore(Load, LocalAddr.first);
  1115. } else {
  1116. LoadInst *Load =
  1117. Builder.CreateLoad(LocalAddr.second, LocalAddr.first, "shared.write");
  1118. Builder.CreateStore(Load, GlobalAddr.first);
  1119. }
  1120. }
  1121. void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
  1122. ppcg_kernel_stmt *KernelStmt) {
  1123. auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  1124. isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;
  1125. LoopToScevMapT LTS;
  1126. LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());
  1127. createSubstitutions(Expr, Stmt, LTS);
  1128. if (Stmt->isBlockStmt())
  1129. BlockGen.copyStmt(*Stmt, LTS, Indexes);
  1130. else
  1131. RegionGen.copyStmt(*Stmt, LTS, Indexes);
  1132. }
  1133. void GPUNodeBuilder::createKernelSync() {
  1134. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  1135. const char *SpirName = "__gen_ocl_barrier_global";
  1136. Function *Sync;
  1137. switch (Arch) {
  1138. case GPUArch::SPIR64:
  1139. case GPUArch::SPIR32:
  1140. Sync = M->getFunction(SpirName);
  1141. // If Sync is not available, declare it.
  1142. if (!Sync) {
  1143. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  1144. std::vector<Type *> Args;
  1145. FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
  1146. Sync = Function::Create(Ty, Linkage, SpirName, M);
  1147. Sync->setCallingConv(CallingConv::SPIR_FUNC);
  1148. }
  1149. break;
  1150. case GPUArch::NVPTX64:
  1151. Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
  1152. break;
  1153. }
  1154. Builder.CreateCall(Sync, {});
  1155. }
  1156. /// Collect llvm::Values referenced from @p Node
  1157. ///
  1158. /// This function only applies to isl_ast_nodes that are user_nodes referring
1159. /// to a ScopStmt. All other node types are ignored.
  1160. ///
  1161. /// @param Node The node to collect references for.
  1162. /// @param User A user pointer used as storage for the data that is collected.
  1163. ///
  1164. /// @returns isl_bool_true if data could be collected successfully.
  1165. isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  1166. if (isl_ast_node_get_type(Node) != isl_ast_node_user)
  1167. return isl_bool_true;
  1168. isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  1169. isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  1170. isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  1171. const char *Str = isl_id_get_name(Id);
  1172. isl_id_free(Id);
  1173. isl_ast_expr_free(StmtExpr);
  1174. isl_ast_expr_free(Expr);
  1175. if (!isPrefix(Str, "Stmt"))
  1176. return isl_bool_true;
  1177. Id = isl_ast_node_get_annotation(Node);
  1178. auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  1179. auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  1180. isl_id_free(Id);
  1181. addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);
  1182. return isl_bool_true;
  1183. }
  1184. /// A list of functions that are available in NVIDIA's libdevice.
  1185. const std::set<std::string> CUDALibDeviceFunctions = {
  1186. "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf",
  1187. "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"};
  1188. // A map from intrinsics to their corresponding libdevice functions.
  1189. const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
  1190. {"llvm.exp.f64", "exp"},
  1191. {"llvm.exp.f32", "expf"},
  1192. {"llvm.powi.f64.i32", "powi"},
  1193. {"llvm.powi.f32.i32", "powif"}};
1194. /// Return the corresponding CUDA libdevice function name for @p Name.
1195. /// Note that this function will try to convert intrinsics in the list
1196. /// IntrinsicToLibdeviceFunc into libdevice functions.
1197. /// This is because some intrinsics, such as `llvm.exp.*`,
1198. /// are not supported by the NVPTX backend.
  1199. /// If this restriction of the backend is lifted, we should refactor our code
  1200. /// so that we use intrinsics whenever possible.
  1201. ///
  1202. /// Return "" if we are not compiling for CUDA.
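///
/// For example, "llvm.exp.f64" is first mapped to "exp" and then to
/// "__nv_exp", while a plain "cosf" is mapped directly to "__nv_cosf".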
  1203. std::string getCUDALibDeviceFuntion(StringRef NameRef) {
  1204. std::string Name = NameRef.str();
  1205. auto It = IntrinsicToLibdeviceFunc.find(Name);
  1206. if (It != IntrinsicToLibdeviceFunc.end())
  1207. return getCUDALibDeviceFuntion(It->second);
  1208. if (CUDALibDeviceFunctions.count(Name))
  1209. return ("__nv_" + Name);
  1210. return "";
  1211. }
  1212. /// Check if F is a function that we can code-generate in a GPU kernel.
  1213. static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
  1214. assert(F && "F is an invalid pointer");
1215. // We string compare against the name of the function to allow
1216. // all variants of the intrinsics "llvm.sqrt.*", "llvm.fabs.*", and
1217. // "llvm.copysign.*".
  1218. const StringRef Name = F->getName();
  1219. if (AllowLibDevice && getCUDALibDeviceFuntion(Name).length() > 0)
  1220. return true;
  1221. return F->isIntrinsic() &&
  1222. (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
  1223. Name.startswith("llvm.copysign"));
  1224. }
  1225. /// Do not take `Function` as a subtree value.
  1226. ///
  1227. /// We try to take the reference of all subtree values and pass them along
  1228. /// to the kernel from the host. Taking an address of any function and
1229. /// trying to pass it along is nonsensical. Only allow `Value`s that are not
  1230. /// `Function`s.
  1231. static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
  1232. /// Return `Function`s from `RawSubtreeValues`.
  1233. static SetVector<Function *>
  1234. getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
  1235. bool AllowCUDALibDevice) {
  1236. SetVector<Function *> SubtreeFunctions;
  1237. for (Value *It : RawSubtreeValues) {
  1238. Function *F = dyn_cast<Function>(It);
  1239. if (F) {
  1240. assert(isValidFunctionInKernel(F, AllowCUDALibDevice) &&
  1241. "Code should have bailed out by "
  1242. "this point if an invalid function "
  1243. "were present in a kernel.");
  1244. SubtreeFunctions.insert(F);
  1245. }
  1246. }
  1247. return SubtreeFunctions;
  1248. }
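/// Collect everything a kernel references from the host: llvm::Values used by
/// the kernel statements (minus values that are passed explicitly anyway,
/// i.e. array base pointers, scop parameters and host loop iterators),
/// functions that may be called from the kernel, loops outside the scop whose
/// induction variables are needed, and a parameter space covering parameters
/// that only occur in access functions.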
  1249. std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
  1250. isl::space>
  1251. GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  1252. SetVector<Value *> SubtreeValues;
  1253. SetVector<const SCEV *> SCEVs;
  1254. SetVector<const Loop *> Loops;
  1255. isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params();
  1256. SubtreeReferences References = {
  1257. LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(),
  1258. &ParamSpace};
  1259. for (const auto &I : IDToValue)
  1260. SubtreeValues.insert(I.second);
  1261. // NOTE: this is populated in IslNodeBuilder::addParameters
  1262. // See [Code generation of induction variables of loops outside Scops].
  1263. for (const auto &I : OutsideLoopIterations)
  1264. SubtreeValues.insert(cast<SCEVUnknown>(I.second)->getValue());
  1265. isl_ast_node_foreach_descendant_top_down(
  1266. Kernel->tree, collectReferencesInGPUStmt, &References);
  1267. for (const SCEV *Expr : SCEVs) {
  1268. findValues(Expr, SE, SubtreeValues);
  1269. findLoops(Expr, Loops);
  1270. }
  1271. Loops.remove_if([this](const Loop *L) {
  1272. return S.contains(L) || L->contains(S.getEntry());
  1273. });
  1274. for (auto &SAI : S.arrays())
  1275. SubtreeValues.remove(SAI->getBasePtr());
  1276. isl_space *Space = S.getParamSpace().release();
  1277. for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) {
  1278. isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
  1279. assert(IDToValue.count(Id));
  1280. Value *Val = IDToValue[Id];
  1281. SubtreeValues.remove(Val);
  1282. isl_id_free(Id);
  1283. }
  1284. isl_space_free(Space);
  1285. for (long i = 0, n = isl_space_dim(Kernel->space, isl_dim_set); i < n; i++) {
  1286. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
  1287. assert(IDToValue.count(Id));
  1288. Value *Val = IDToValue[Id];
  1289. SubtreeValues.remove(Val);
  1290. isl_id_free(Id);
  1291. }
  1292. // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
1293. // SubtreeValues. This is important because we should not lose any
1294. // SubtreeValues in the process of constructing the
1295. // ValidSubtree{Values, Functions} sets. Nor should the two
1296. // sets have any element in common.
  1297. auto ValidSubtreeValuesIt =
  1298. make_filter_range(SubtreeValues, isValidSubtreeValue);
  1299. SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
  1300. ValidSubtreeValuesIt.end());
  1301. bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64;
  1302. SetVector<Function *> ValidSubtreeFunctions(
  1303. getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice));
  1304. // @see IslNodeBuilder::getReferencesInSubtree
  1305. SetVector<Value *> ReplacedValues;
  1306. for (Value *V : ValidSubtreeValues) {
  1307. auto It = ValueMap.find(V);
  1308. if (It == ValueMap.end())
  1309. ReplacedValues.insert(V);
  1310. else
  1311. ReplacedValues.insert(It->second);
  1312. }
  1313. return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops,
  1314. ParamSpace);
  1315. }
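// The next three helpers remove the just-built kernel function from the
// host-side analyses (DominatorTree, ScalarEvolution, LoopInfo). Generating
// the kernel updates these analyses as a side effect, but the kernel lives in
// its own module, so the host analyses must not keep referring to its blocks
// and loops.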
  1316. void GPUNodeBuilder::clearDominators(Function *F) {
  1317. DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  1318. std::vector<BasicBlock *> Nodes;
  1319. for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
  1320. Nodes.push_back(I->getBlock());
  1321. for (BasicBlock *BB : Nodes)
  1322. DT.eraseNode(BB);
  1323. }
  1324. void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  1325. for (BasicBlock &BB : *F) {
  1326. Loop *L = LI.getLoopFor(&BB);
  1327. if (L)
  1328. SE.forgetLoop(L);
  1329. }
  1330. }
  1331. void GPUNodeBuilder::clearLoops(Function *F) {
  1332. SmallSet<Loop *, 1> WorkList;
  1333. for (BasicBlock &BB : *F) {
  1334. Loop *L = LI.getLoopFor(&BB);
  1335. if (L)
  1336. WorkList.insert(L);
  1337. }
  1338. for (auto *L : WorkList)
  1339. LI.erase(L);
  1340. }
  1341. std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  1342. std::vector<Value *> Sizes;
  1343. isl::ast_build Context = isl::ast_build::from_context(S.getContext());
  1344. isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size);
  1345. for (long i = 0; i < Kernel->n_grid; i++) {
  1346. isl::pw_aff Size = GridSizePwAffs.at(i);
  1347. isl::ast_expr GridSize = Context.expr_from(Size);
  1348. Value *Res = ExprBuilder.create(GridSize.release());
  1349. Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
  1350. Sizes.push_back(Res);
  1351. }
  1352. for (long i = Kernel->n_grid; i < 3; i++)
  1353. Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
  1354. return std::make_tuple(Sizes[0], Sizes[1]);
  1355. }
  1356. std::tuple<Value *, Value *, Value *>
  1357. GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  1358. std::vector<Value *> Sizes;
  1359. for (long i = 0; i < Kernel->n_block; i++) {
  1360. Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
  1361. Sizes.push_back(Res);
  1362. }
  1363. for (long i = Kernel->n_block; i < 3; i++)
  1364. Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
  1365. return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
  1366. }
  1367. void GPUNodeBuilder::insertStoreParameter(Type *ArrayTy,
  1368. Instruction *Parameters,
  1369. Instruction *Param, int Index) {
  1370. Value *Slot = Builder.CreateGEP(
  1371. ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
  1372. Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  1373. Builder.CreateStore(ParamTyped, Slot);
  1374. }
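// createLaunchParameters builds the argument block that is handed to
// createCallLaunchKernel: an array of i8* slots, one per kernel argument, in
// the order device arrays, host loop iterators, scop parameters, subtree
// values. Each slot holds an i8*-cast pointer to the respective value
// (typically a dedicated alloca). For the OpenCL runtime the array has
// 2 * NumArgs entries and the upper half holds i32 allocas with the byte size
// of every argument. Conceptually (illustration only):
//
//   params = [ &devptr_A, &host_iv, &param_N, ..., &size_0, &size_1, ... ]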
  1375. Value *
  1376. GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
  1377. SetVector<Value *> SubtreeValues) {
  1378. const int NumArgs = F->arg_size();
  1379. std::vector<int> ArgSizes(NumArgs);
  1380. // If we are using the OpenCL Runtime, we need to add the kernel argument
  1381. // sizes to the end of the launch-parameter list, so OpenCL can determine
  1382. // how big the respective kernel arguments are.
  1383. // Here we need to reserve adequate space for that.
  1384. Type *ArrayTy;
  1385. if (Runtime == GPURuntime::OpenCL)
  1386. ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);
  1387. else
  1388. ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs);
  1389. BasicBlock *EntryBlock =
  1390. &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  1391. auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
  1392. std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  1393. Instruction *Parameters = new AllocaInst(
  1394. ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());
  1395. int Index = 0;
  1396. for (long i = 0; i < Prog->n_array; i++) {
  1397. if (!ppcg_kernel_requires_array_argument(Kernel, i))
  1398. continue;
  1399. isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
  1400. const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
  1401. if (Runtime == GPURuntime::OpenCL)
  1402. ArgSizes[Index] = SAI->getElemSizeInBytes();
  1403. Value *DevArray = nullptr;
  1404. if (PollyManagedMemory) {
  1405. DevArray = getManagedDeviceArray(&Prog->array[i],
  1406. const_cast<ScopArrayInfo *>(SAI));
  1407. } else {
  1408. DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
  1409. DevArray = createCallGetDevicePtr(DevArray);
  1410. }
  1411. assert(DevArray != nullptr && "Array to be offloaded to device not "
  1412. "initialized");
  1413. Value *Offset = getArrayOffset(&Prog->array[i]);
  1414. if (Offset) {
  1415. DevArray = Builder.CreatePointerCast(
  1416. DevArray, SAI->getElementType()->getPointerTo());
  1417. DevArray = Builder.CreateGEP(SAI->getElementType(), DevArray,
  1418. Builder.CreateNeg(Offset));
  1419. DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
  1420. }
  1421. Value *Slot = Builder.CreateGEP(
  1422. ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
  1423. if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
  1424. Value *ValPtr = nullptr;
  1425. if (PollyManagedMemory)
  1426. ValPtr = DevArray;
  1427. else
  1428. ValPtr = BlockGen.getOrCreateAlloca(SAI);
  1429. assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
  1430. " to be stored into Parameters");
  1431. Value *ValPtrCast =
  1432. Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
  1433. Builder.CreateStore(ValPtrCast, Slot);
  1434. } else {
  1435. Instruction *Param =
  1436. new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
  1437. Launch + "_param_" + std::to_string(Index),
  1438. EntryBlock->getTerminator());
  1439. Builder.CreateStore(DevArray, Param);
  1440. Value *ParamTyped =
  1441. Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  1442. Builder.CreateStore(ParamTyped, Slot);
  1443. }
  1444. Index++;
  1445. }
  1446. int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
  1447. for (long i = 0; i < NumHostIters; i++) {
  1448. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
  1449. Value *Val = IDToValue[Id];
  1450. isl_id_free(Id);
  1451. if (Runtime == GPURuntime::OpenCL)
  1452. ArgSizes[Index] = computeSizeInBytes(Val->getType());
  1453. Instruction *Param =
  1454. new AllocaInst(Val->getType(), AddressSpace,
  1455. Launch + "_param_" + std::to_string(Index),
  1456. EntryBlock->getTerminator());
  1457. Builder.CreateStore(Val, Param);
  1458. insertStoreParameter(ArrayTy, Parameters, Param, Index);
  1459. Index++;
  1460. }
  1461. int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
  1462. for (long i = 0; i < NumVars; i++) {
  1463. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
  1464. Value *Val = IDToValue[Id];
  1465. if (ValueMap.count(Val))
  1466. Val = ValueMap[Val];
  1467. isl_id_free(Id);
  1468. if (Runtime == GPURuntime::OpenCL)
  1469. ArgSizes[Index] = computeSizeInBytes(Val->getType());
  1470. Instruction *Param =
  1471. new AllocaInst(Val->getType(), AddressSpace,
  1472. Launch + "_param_" + std::to_string(Index),
  1473. EntryBlock->getTerminator());
  1474. Builder.CreateStore(Val, Param);
  1475. insertStoreParameter(ArrayTy, Parameters, Param, Index);
  1476. Index++;
  1477. }
  1478. for (auto Val : SubtreeValues) {
  1479. if (Runtime == GPURuntime::OpenCL)
  1480. ArgSizes[Index] = computeSizeInBytes(Val->getType());
  1481. Instruction *Param =
  1482. new AllocaInst(Val->getType(), AddressSpace,
  1483. Launch + "_param_" + std::to_string(Index),
  1484. EntryBlock->getTerminator());
  1485. Builder.CreateStore(Val, Param);
  1486. insertStoreParameter(ArrayTy, Parameters, Param, Index);
  1487. Index++;
  1488. }
  1489. if (Runtime == GPURuntime::OpenCL) {
  1490. for (int i = 0; i < NumArgs; i++) {
  1491. Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
  1492. Instruction *Param =
  1493. new AllocaInst(Builder.getInt32Ty(), AddressSpace,
  1494. Launch + "_param_size_" + std::to_string(i),
  1495. EntryBlock->getTerminator());
  1496. Builder.CreateStore(Val, Param);
  1497. insertStoreParameter(ArrayTy, Parameters, Param, Index);
  1498. Index++;
  1499. }
  1500. }
  1501. auto Location = EntryBlock->getTerminator();
  1502. return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
  1503. Launch + "_params_i8ptr", Location);
  1504. }
  1505. void GPUNodeBuilder::setupKernelSubtreeFunctions(
  1506. SetVector<Function *> SubtreeFunctions) {
  1507. for (auto Fn : SubtreeFunctions) {
  1508. const std::string ClonedFnName = Fn->getName().str();
  1509. Function *Clone = GPUModule->getFunction(ClonedFnName);
  1510. if (!Clone)
  1511. Clone =
  1512. Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
  1513. ClonedFnName, GPUModule.get());
  1514. assert(Clone && "Expected cloned function to be initialized.");
  1515. assert(ValueMap.find(Fn) == ValueMap.end() &&
  1516. "Fn already present in ValueMap");
  1517. ValueMap[Fn] = Clone;
  1518. }
  1519. }
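// createKernel drives per-kernel code generation: it collects the values,
// functions and loops referenced by the kernel, builds the kernel function in
// a fresh GPUModule, generates device code from the kernel's AST, compiles the
// module to a target string (PTX, or the SPIR module printed as LLVM IR), and
// finally emits the host-side calls that fetch, launch and free the kernel
// with the computed grid and block sizes.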
  1520. void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  1521. isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  1522. ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  1523. isl_id_free(Id);
  1524. isl_ast_node_free(KernelStmt);
  1525. if (Kernel->n_grid > 1)
  1526. DeepestParallel = std::max(
  1527. DeepestParallel, (unsigned)isl_space_dim(Kernel->space, isl_dim_set));
  1528. else
  1529. DeepestSequential = std::max(
  1530. DeepestSequential, (unsigned)isl_space_dim(Kernel->space, isl_dim_set));
  1531. Value *BlockDimX, *BlockDimY, *BlockDimZ;
  1532. std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
  1533. SetVector<Value *> SubtreeValues;
  1534. SetVector<Function *> SubtreeFunctions;
  1535. SetVector<const Loop *> Loops;
  1536. isl::space ParamSpace;
  1537. std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) =
  1538. getReferencesInKernel(Kernel);
1539. // Add parameters that appear only in the access functions to the kernel
1540. // space. This is important to make sure that all isl_ids are passed as
1541. // parameters to the kernel, even though, to improve compile time, not all
1542. // of them may appear in the context.
  1543. Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release());
  1544. assert(Kernel->tree && "Device AST of kernel node is empty");
  1545. Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  1546. IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  1547. ValueMapT HostValueMap = ValueMap;
  1548. BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
  1549. ScalarMap.clear();
  1550. BlockGenerator::EscapeUsersAllocaMapTy HostEscapeMap = EscapeMap;
  1551. EscapeMap.clear();
  1552. // Create for all loops we depend on values that contain the current loop
  1553. // iteration. These values are necessary to generate code for SCEVs that
  1554. // depend on such loops. As a result we need to pass them to the subfunction.
  1555. for (const Loop *L : Loops) {
  1556. const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
  1557. SE.getUnknown(Builder.getInt64(1)),
  1558. L, SCEV::FlagAnyWrap);
  1559. Value *V = generateSCEV(OuterLIV);
  1560. OutsideLoopIterations[L] = SE.getUnknown(V);
  1561. SubtreeValues.insert(V);
  1562. }
  1563. createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
  1564. setupKernelSubtreeFunctions(SubtreeFunctions);
  1565. create(isl_ast_node_copy(Kernel->tree));
  1566. finalizeKernelArguments(Kernel);
  1567. Function *F = Builder.GetInsertBlock()->getParent();
  1568. if (Arch == GPUArch::NVPTX64)
  1569. addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  1570. clearDominators(F);
  1571. clearScalarEvolution(F);
  1572. clearLoops(F);
  1573. IDToValue = HostIDs;
  1574. ValueMap = std::move(HostValueMap);
  1575. ScalarMap = std::move(HostScalarMap);
  1576. EscapeMap = std::move(HostEscapeMap);
  1577. IDToSAI.clear();
  1578. Annotator.resetAlternativeAliasBases();
  1579. for (auto &BasePtr : LocalArrays)
  1580. S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
  1581. LocalArrays.clear();
  1582. std::string ASMString = finalizeKernelFunction();
  1583. Builder.SetInsertPoint(&HostInsertPoint);
  1584. Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
  1585. std::string Name = getKernelFuncName(Kernel->id);
  1586. Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  1587. Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  1588. Value *GPUKernel = createCallGetKernel(KernelString, NameString);
  1589. Value *GridDimX, *GridDimY;
  1590. std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
  1591. createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
  1592. BlockDimZ, Parameters);
  1593. createCallFreeKernel(GPUKernel);
  1594. for (auto Id : KernelIds)
  1595. isl_id_free(Id);
  1596. KernelIds.clear();
  1597. }
  1598. /// Compute the DataLayout string for the NVPTX backend.
  1599. ///
  1600. /// @param is64Bit Are we looking for a 64 bit architecture?
  1601. static std::string computeNVPTXDataLayout(bool is64Bit) {
  1602. std::string Ret = "";
  1603. if (!is64Bit) {
  1604. Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
  1605. "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
  1606. "64-v128:128:128-n16:32:64";
  1607. } else {
  1608. Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
  1609. "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
  1610. "64-v128:128:128-n16:32:64";
  1611. }
  1612. return Ret;
  1613. }
  1614. /// Compute the DataLayout string for a SPIR kernel.
  1615. ///
  1616. /// @param is64Bit Are we looking for a 64 bit architecture?
  1617. static std::string computeSPIRDataLayout(bool is64Bit) {
  1618. std::string Ret = "";
  1619. if (!is64Bit) {
  1620. Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
  1621. "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
  1622. "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
  1623. "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  1624. } else {
  1625. Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
  1626. "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
  1627. "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
  1628. "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  1629. }
  1630. return Ret;
  1631. }
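// The kernel function declaration mirrors the launch-parameter order used on
// the host: array arguments (read-only scalars are passed by value, all other
// arrays as i8 pointers in the global address space), one i64 per host loop
// iterator, the scop parameters, and finally the subtree values. For SPIR
// targets the usual OpenCL kernel_arg_* metadata is attached as well.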
  1632. Function *
  1633. GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
  1634. SetVector<Value *> &SubtreeValues) {
  1635. std::vector<Type *> Args;
  1636. std::string Identifier = getKernelFuncName(Kernel->id);
  1637. std::vector<Metadata *> MemoryType;
  1638. for (long i = 0; i < Prog->n_array; i++) {
  1639. if (!ppcg_kernel_requires_array_argument(Kernel, i))
  1640. continue;
  1641. if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
  1642. isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
  1643. const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
  1644. Args.push_back(SAI->getElementType());
  1645. MemoryType.push_back(
  1646. ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  1647. } else {
  1648. static const int UseGlobalMemory = 1;
  1649. Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
  1650. MemoryType.push_back(
  1651. ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
  1652. }
  1653. }
  1654. int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
  1655. for (long i = 0; i < NumHostIters; i++) {
  1656. Args.push_back(Builder.getInt64Ty());
  1657. MemoryType.push_back(
  1658. ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  1659. }
  1660. int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
  1661. for (long i = 0; i < NumVars; i++) {
  1662. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
  1663. Value *Val = IDToValue[Id];
  1664. isl_id_free(Id);
  1665. Args.push_back(Val->getType());
  1666. MemoryType.push_back(
  1667. ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  1668. }
  1669. for (auto *V : SubtreeValues) {
  1670. Args.push_back(V->getType());
  1671. MemoryType.push_back(
  1672. ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  1673. }
  1674. auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  1675. auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
  1676. GPUModule.get());
  1677. std::vector<Metadata *> EmptyStrings;
  1678. for (unsigned int i = 0; i < MemoryType.size(); i++) {
  1679. EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
  1680. }
  1681. if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
  1682. FN->setMetadata("kernel_arg_addr_space",
  1683. MDNode::get(FN->getContext(), MemoryType));
  1684. FN->setMetadata("kernel_arg_name",
  1685. MDNode::get(FN->getContext(), EmptyStrings));
  1686. FN->setMetadata("kernel_arg_access_qual",
  1687. MDNode::get(FN->getContext(), EmptyStrings));
  1688. FN->setMetadata("kernel_arg_type",
  1689. MDNode::get(FN->getContext(), EmptyStrings));
  1690. FN->setMetadata("kernel_arg_type_qual",
  1691. MDNode::get(FN->getContext(), EmptyStrings));
  1692. FN->setMetadata("kernel_arg_base_type",
  1693. MDNode::get(FN->getContext(), EmptyStrings));
  1694. }
  1695. switch (Arch) {
  1696. case GPUArch::NVPTX64:
  1697. FN->setCallingConv(CallingConv::PTX_Kernel);
  1698. break;
  1699. case GPUArch::SPIR32:
  1700. case GPUArch::SPIR64:
  1701. FN->setCallingConv(CallingConv::SPIR_KERNEL);
  1702. break;
  1703. }
  1704. auto Arg = FN->arg_begin();
  1705. for (long i = 0; i < Kernel->n_array; i++) {
  1706. if (!ppcg_kernel_requires_array_argument(Kernel, i))
  1707. continue;
  1708. Arg->setName(Kernel->array[i].array->name);
  1709. isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
  1710. const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
  1711. Type *EleTy = SAI->getElementType();
  1712. Value *Val = &*Arg;
  1713. SmallVector<const SCEV *, 4> Sizes;
  1714. isl_ast_build *Build =
  1715. isl_ast_build_from_context(isl_set_copy(Prog->context));
  1716. Sizes.push_back(nullptr);
  1717. for (long j = 1, n = Kernel->array[i].array->n_index; j < n; j++) {
  1718. isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
  1719. Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
  1720. auto V = ExprBuilder.create(DimSize);
  1721. Sizes.push_back(SE.getSCEV(V));
  1722. }
  1723. const ScopArrayInfo *SAIRep =
  1724. S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
  1725. LocalArrays.push_back(Val);
  1726. isl_ast_build_free(Build);
  1727. KernelIds.push_back(Id);
  1728. IDToSAI[Id] = SAIRep;
  1729. Arg++;
  1730. }
  1731. for (long i = 0; i < NumHostIters; i++) {
  1732. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
  1733. Arg->setName(isl_id_get_name(Id));
  1734. IDToValue[Id] = &*Arg;
  1735. KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  1736. Arg++;
  1737. }
  1738. for (long i = 0; i < NumVars; i++) {
  1739. isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
  1740. Arg->setName(isl_id_get_name(Id));
  1741. Value *Val = IDToValue[Id];
  1742. ValueMap[Val] = &*Arg;
  1743. IDToValue[Id] = &*Arg;
  1744. KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  1745. Arg++;
  1746. }
  1747. for (auto *V : SubtreeValues) {
  1748. Arg->setName(V->getName());
  1749. ValueMap[V] = &*Arg;
  1750. Arg++;
  1751. }
  1752. return FN;
  1753. }
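// On NVPTX the ppcg block and thread ids are bound to the corresponding NVVM
// special-register intrinsics: block ids to nvvm.read.ptx.sreg.ctaid.{x,y} and
// thread ids to nvvm.read.ptx.sreg.tid.{x,y,z}. Each read is zero-extended to
// i64 and registered in IDToValue so the generated index expressions can use
// it.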
  1754. void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  1755. Intrinsic::ID IntrinsicsBID[2];
  1756. Intrinsic::ID IntrinsicsTID[3];
  1757. switch (Arch) {
  1758. case GPUArch::SPIR64:
  1759. case GPUArch::SPIR32:
  1760. llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
  1761. case GPUArch::NVPTX64:
  1762. IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
  1763. IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
  1764. IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
  1765. IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
  1766. IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
  1767. break;
  1768. }
  1769. auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
  1770. std::string Name = isl_id_get_name(Id);
  1771. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  1772. Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
  1773. Value *Val = Builder.CreateCall(IntrinsicFn, {});
  1774. Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
  1775. IDToValue[Id] = Val;
  1776. KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  1777. };
  1778. for (int i = 0; i < Kernel->n_grid; ++i) {
  1779. isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
  1780. addId(Id, IntrinsicsBID[i]);
  1781. }
  1782. for (int i = 0; i < Kernel->n_block; ++i) {
  1783. isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
  1784. addId(Id, IntrinsicsTID[i]);
  1785. }
  1786. }
  1787. void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel,
  1788. bool SizeTypeIs64bit) {
  1789. const char *GroupName[3] = {"__gen_ocl_get_group_id0",
  1790. "__gen_ocl_get_group_id1",
  1791. "__gen_ocl_get_group_id2"};
  1792. const char *LocalName[3] = {"__gen_ocl_get_local_id0",
  1793. "__gen_ocl_get_local_id1",
  1794. "__gen_ocl_get_local_id2"};
  1795. IntegerType *SizeT =
  1796. SizeTypeIs64bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
  1797. auto createFunc = [this](const char *Name, __isl_take isl_id *Id,
  1798. IntegerType *SizeT) mutable {
  1799. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  1800. Function *FN = M->getFunction(Name);
  1801. // If FN is not available, declare it.
  1802. if (!FN) {
  1803. GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  1804. std::vector<Type *> Args;
  1805. FunctionType *Ty = FunctionType::get(SizeT, Args, false);
  1806. FN = Function::Create(Ty, Linkage, Name, M);
  1807. FN->setCallingConv(CallingConv::SPIR_FUNC);
  1808. }
  1809. Value *Val = Builder.CreateCall(FN, {});
  1810. if (SizeT == Builder.getInt32Ty())
  1811. Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
  1812. IDToValue[Id] = Val;
  1813. KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  1814. };
  1815. for (int i = 0; i < Kernel->n_grid; ++i)
  1816. createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i), SizeT);
  1817. for (int i = 0; i < Kernel->n_block; ++i)
  1818. createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i), SizeT);
  1819. }
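// Scalars need extra glue on both ends of the kernel: prepareKernelArguments
// copies each scalar argument into its alloca at kernel entry (loading through
// a typed pointer unless the scalar is read-only and passed by value), and
// finalizeKernelArguments writes modified scalars back through the argument
// pointer before the kernel returns, bailing out if such a store could race
// between threads.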
  1820. void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  1821. auto Arg = FN->arg_begin();
  1822. for (long i = 0; i < Kernel->n_array; i++) {
  1823. if (!ppcg_kernel_requires_array_argument(Kernel, i))
  1824. continue;
  1825. isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
  1826. const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
  1827. isl_id_free(Id);
  1828. if (SAI->getNumberOfDimensions() > 0) {
  1829. Arg++;
  1830. continue;
  1831. }
  1832. Value *Val = &*Arg;
  1833. if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
  1834. Type *TypePtr = SAI->getElementType()->getPointerTo();
  1835. Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
  1836. Val = Builder.CreateLoad(SAI->getElementType(), TypedArgPtr);
  1837. }
  1838. Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
  1839. Builder.CreateStore(Val, Alloca);
  1840. Arg++;
  1841. }
  1842. }
  1843. void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  1844. auto *FN = Builder.GetInsertBlock()->getParent();
  1845. auto Arg = FN->arg_begin();
  1846. bool StoredScalar = false;
  1847. for (long i = 0; i < Kernel->n_array; i++) {
  1848. if (!ppcg_kernel_requires_array_argument(Kernel, i))
  1849. continue;
  1850. isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
  1851. const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id));
  1852. isl_id_free(Id);
  1853. if (SAI->getNumberOfDimensions() > 0) {
  1854. Arg++;
  1855. continue;
  1856. }
  1857. if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
  1858. Arg++;
  1859. continue;
  1860. }
  1861. Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
  1862. Value *ArgPtr = &*Arg;
  1863. Type *TypePtr = SAI->getElementType()->getPointerTo();
  1864. Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
  1865. Value *Val = Builder.CreateLoad(SAI->getElementType(), Alloca);
  1866. Builder.CreateStore(Val, TypedArgPtr);
  1867. StoredScalar = true;
  1868. Arg++;
  1869. }
  1870. if (StoredScalar) {
1871. /// In case more than one thread stores to a scalar, the generated
1872. /// code might be incorrect if we only write the value back at the end
1873. /// of the kernel. To support this case we would need to store these
1874. /// scalars back at each memory store or at least before each kernel barrier.
  1875. if (Kernel->n_block != 0 || Kernel->n_grid != 0) {
  1876. BuildSuccessful = 0;
  1877. LLVM_DEBUG(
  1878. dbgs() << getUniqueScopName(&S)
  1879. << " has a store to a scalar value that"
  1880. " would be undefined to run in parallel. Bailing out.\n";);
  1881. }
  1882. }
  1883. }
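// Kernel-local arrays from ppcg are materialized here: shared arrays become
// zero-initialized module-level globals in address space 3, private arrays
// become allocas inside the kernel, and both are registered as ScopArrayInfo
// so index expressions can be generated for them.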
  1884. void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  1885. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  1886. for (int i = 0; i < Kernel->n_var; ++i) {
  1887. struct ppcg_kernel_var &Var = Kernel->var[i];
  1888. isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
  1889. Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType();
  1890. Type *ArrayTy = EleTy;
  1891. SmallVector<const SCEV *, 4> Sizes;
  1892. Sizes.push_back(nullptr);
  1893. for (unsigned int j = 1; j < Var.array->n_index; ++j) {
  1894. isl_val *Val = isl_vec_get_element_val(Var.size, j);
  1895. long Bound = isl_val_get_num_si(Val);
  1896. isl_val_free(Val);
  1897. Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
  1898. }
  1899. for (int j = Var.array->n_index - 1; j >= 0; --j) {
  1900. isl_val *Val = isl_vec_get_element_val(Var.size, j);
  1901. long Bound = isl_val_get_num_si(Val);
  1902. isl_val_free(Val);
  1903. ArrayTy = ArrayType::get(ArrayTy, Bound);
  1904. }
  1905. const ScopArrayInfo *SAI;
  1906. Value *Allocation;
  1907. if (Var.type == ppcg_access_shared) {
  1908. auto GlobalVar = new GlobalVariable(
  1909. *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
  1910. nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
  1911. GlobalVar->setAlignment(llvm::Align(EleTy->getPrimitiveSizeInBits() / 8));
  1912. GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));
  1913. Allocation = GlobalVar;
  1914. } else if (Var.type == ppcg_access_private) {
  1915. Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
  1916. } else {
  1917. llvm_unreachable("unknown variable type");
  1918. }
  1919. SAI =
  1920. S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
  1921. Id = isl_id_alloc(S.getIslCtx().get(), Var.name, nullptr);
  1922. IDToValue[Id] = Allocation;
  1923. LocalArrays.push_back(Allocation);
  1924. KernelIds.push_back(Id);
  1925. IDToSAI[Id] = SAI;
  1926. }
  1927. }
  1928. void GPUNodeBuilder::createKernelFunction(
  1929. ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
  1930. SetVector<Function *> &SubtreeFunctions) {
  1931. std::string Identifier = getKernelFuncName(Kernel->id);
  1932. GPUModule.reset(new Module(Identifier, Builder.getContext()));
  1933. switch (Arch) {
  1934. case GPUArch::NVPTX64:
  1935. if (Runtime == GPURuntime::CUDA)
  1936. GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  1937. else if (Runtime == GPURuntime::OpenCL)
  1938. GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
  1939. GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
  1940. break;
  1941. case GPUArch::SPIR32:
  1942. GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
  1943. GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
  1944. break;
  1945. case GPUArch::SPIR64:
  1946. GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
  1947. GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
  1948. break;
  1949. }
  1950. Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
  1951. BasicBlock *PrevBlock = Builder.GetInsertBlock();
  1952. auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
  1953. DT.addNewBlock(EntryBlock, PrevBlock);
  1954. Builder.SetInsertPoint(EntryBlock);
  1955. Builder.CreateRetVoid();
  1956. Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
  1957. ScopDetection::markFunctionAsInvalid(FN);
  1958. prepareKernelArguments(Kernel, FN);
  1959. createKernelVariables(Kernel, FN);
  1960. switch (Arch) {
  1961. case GPUArch::NVPTX64:
  1962. insertKernelIntrinsics(Kernel);
  1963. break;
  1964. case GPUArch::SPIR32:
  1965. insertKernelCallsSPIR(Kernel, false);
  1966. break;
  1967. case GPUArch::SPIR64:
  1968. insertKernelCallsSPIR(Kernel, true);
  1969. break;
  1970. }
  1971. }
  1972. std::string GPUNodeBuilder::createKernelASM() {
  1973. llvm::Triple GPUTriple;
  1974. switch (Arch) {
  1975. case GPUArch::NVPTX64:
  1976. switch (Runtime) {
  1977. case GPURuntime::CUDA:
  1978. GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
  1979. break;
  1980. case GPURuntime::OpenCL:
  1981. GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
  1982. break;
  1983. }
  1984. break;
  1985. case GPUArch::SPIR64:
  1986. case GPUArch::SPIR32:
  1987. std::string SPIRAssembly;
  1988. raw_string_ostream IROstream(SPIRAssembly);
  1989. IROstream << *GPUModule;
  1990. IROstream.flush();
  1991. return SPIRAssembly;
  1992. }
  1993. std::string ErrMsg;
  1994. auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
  1995. if (!GPUTarget) {
  1996. errs() << ErrMsg << "\n";
  1997. return "";
  1998. }
  1999. TargetOptions Options;
  2000. Options.UnsafeFPMath = FastMath;
  2001. std::string subtarget;
  2002. switch (Arch) {
  2003. case GPUArch::NVPTX64:
  2004. subtarget = CudaVersion;
  2005. break;
  2006. case GPUArch::SPIR32:
  2007. case GPUArch::SPIR64:
  2008. llvm_unreachable("No subtarget for SPIR architecture");
  2009. }
  2010. std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
  2011. GPUTriple.getTriple(), subtarget, "", Options, std::nullopt));
  2012. SmallString<0> ASMString;
  2013. raw_svector_ostream ASMStream(ASMString);
  2014. llvm::legacy::PassManager PM;
  2015. PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
  2016. if (TargetM->addPassesToEmitFile(PM, ASMStream, nullptr, CGFT_AssemblyFile,
  2017. true /* verify */)) {
  2018. errs() << "The target does not support generation of this file type!\n";
  2019. return "";
  2020. }
  2021. PM.run(*GPUModule);
  2022. return ASMStream.str().str();
  2023. }
  2024. bool GPUNodeBuilder::requiresCUDALibDevice() {
  2025. bool RequiresLibDevice = false;
  2026. for (Function &F : GPUModule->functions()) {
  2027. if (!F.isDeclaration())
  2028. continue;
  2029. const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(F.getName());
  2030. if (CUDALibDeviceFunc.length() != 0) {
  2031. // We need to handle the case where a module looks like this:
  2032. // @expf(..)
2033. // @llvm.exp.f32(..)
  2034. // Both of these functions would be renamed to `__nv_expf`.
  2035. //
  2036. // So, we must first check for the existence of the libdevice function.
  2037. // If this exists, we replace our current function with it.
  2038. //
  2039. // If it does not exist, we rename the current function to the
2040. // libdevice function name.
  2041. if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
  2042. F.replaceAllUsesWith(Replacement);
  2043. else
  2044. F.setName(CUDALibDeviceFunc);
  2045. RequiresLibDevice = true;
  2046. }
  2047. }
  2048. return RequiresLibDevice;
  2049. }
  2050. void GPUNodeBuilder::addCUDALibDevice() {
  2051. if (Arch != GPUArch::NVPTX64)
  2052. return;
  2053. if (requiresCUDALibDevice()) {
  2054. SMDiagnostic Error;
  2055. errs() << CUDALibDevice << "\n";
  2056. auto LibDeviceModule =
  2057. parseIRFile(CUDALibDevice, Error, GPUModule->getContext());
  2058. if (!LibDeviceModule) {
  2059. BuildSuccessful = false;
  2060. report_fatal_error("Could not find or load libdevice. Skipping GPU "
  2061. "kernel generation. Please set -polly-acc-libdevice "
  2062. "accordingly.\n");
  2063. return;
  2064. }
  2065. Linker L(*GPUModule);
  2066. // Set an nvptx64 target triple to avoid linker warnings. The original
2067. // triple of the libdevice files is nvptx-unknown-unknown.
  2068. LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  2069. L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
  2070. }
  2071. }
  2072. std::string GPUNodeBuilder::finalizeKernelFunction() {
  2073. if (verifyModule(*GPUModule)) {
  2074. LLVM_DEBUG(dbgs() << "verifyModule failed on module:\n";
  2075. GPUModule->print(dbgs(), nullptr); dbgs() << "\n";);
  2076. LLVM_DEBUG(dbgs() << "verifyModule Error:\n";
  2077. verifyModule(*GPUModule, &dbgs()););
  2078. if (FailOnVerifyModuleFailure)
  2079. llvm_unreachable("VerifyModule failed.");
  2080. BuildSuccessful = false;
  2081. return "";
  2082. }
  2083. addCUDALibDevice();
  2084. if (DumpKernelIR)
  2085. outs() << *GPUModule << "\n";
  2086. if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
  2087. // Optimize module.
  2088. llvm::legacy::PassManager OptPasses;
  2089. PassManagerBuilder PassBuilder;
  2090. PassBuilder.OptLevel = 3;
  2091. PassBuilder.SizeLevel = 0;
  2092. PassBuilder.populateModulePassManager(OptPasses);
  2093. OptPasses.run(*GPUModule);
  2094. }
  2095. std::string Assembly = createKernelASM();
  2096. if (DumpKernelASM)
  2097. outs() << Assembly << "\n";
  2098. GPUModule.release();
  2099. KernelIDs.clear();
  2100. return Assembly;
  2101. }
  2102. /// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff`
  2103. /// @param PwAffs The list of piecewise affine functions to create an
  2104. /// `isl_pw_aff_list` from. We expect an rvalue ref because
  2105. /// all the isl_pw_aff are used up by this function.
  2106. ///
  2107. /// @returns The `isl_pw_aff_list`.
  2108. __isl_give isl_pw_aff_list *
  2109. createPwAffList(isl_ctx *Context,
  2110. const std::vector<__isl_take isl_pw_aff *> &&PwAffs) {
  2111. isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size());
  2112. for (unsigned i = 0; i < PwAffs.size(); i++) {
  2113. List = isl_pw_aff_list_insert(List, i, PwAffs[i]);
  2114. }
  2115. return List;
  2116. }
  2117. /// Align all the `PwAffs` such that they have the same parameter dimensions.
  2118. ///
  2119. /// We loop over all `pw_aff` and align all of their spaces together to
  2120. /// create a common space for all the `pw_aff`. This common space is the
  2121. /// `AlignSpace`. We then align all the `pw_aff` to this space. We start
  2122. /// with the given `SeedSpace`.
  2123. /// @param PwAffs The list of piecewise affine functions we want to align.
  2124. /// This is an rvalue reference because the entire vector is
  2125. /// used up by the end of the operation.
  2126. /// @param SeedSpace The space to start the alignment process with.
  2127. /// @returns A std::pair, whose first element is the aligned space,
  2128. /// whose second element is the vector of aligned piecewise
  2129. /// affines.
  2130. static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>>
  2131. alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs,
  2132. __isl_take isl_space *SeedSpace) {
  2133. assert(SeedSpace && "Invalid seed space given.");
  2134. isl_space *AlignSpace = SeedSpace;
  2135. for (isl_pw_aff *PwAff : PwAffs) {
  2136. isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff);
  2137. AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace);
  2138. }
  2139. std::vector<isl_pw_aff *> AdjustedPwAffs;
  2140. for (unsigned i = 0; i < PwAffs.size(); i++) {
  2141. isl_pw_aff *Adjusted = PwAffs[i];
  2142. assert(Adjusted && "Invalid pw_aff given.");
  2143. Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace));
  2144. AdjustedPwAffs.push_back(Adjusted);
  2145. }
  2146. return std::make_pair(AlignSpace, AdjustedPwAffs);
  2147. }
  2148. namespace {
  2149. class PPCGCodeGeneration final : public ScopPass {
  2150. public:
  2151. static char ID;
  2152. GPURuntime Runtime = GPURuntime::CUDA;
  2153. GPUArch Architecture = GPUArch::NVPTX64;
  2154. /// The scop that is currently processed.
  2155. Scop *S;
  2156. LoopInfo *LI;
  2157. DominatorTree *DT;
  2158. ScalarEvolution *SE;
  2159. const DataLayout *DL;
  2160. RegionInfo *RI;
  2161. PPCGCodeGeneration() : ScopPass(ID) {
  2162. // Apply defaults.
  2163. Runtime = GPURuntimeChoice;
  2164. Architecture = GPUArchChoice;
  2165. }
  2166. /// Construct compilation options for PPCG.
  2167. ///
  2168. /// @returns The compilation options.
  2169. ppcg_options *createPPCGOptions() {
  2170. auto DebugOptions =
  2171. (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
  2172. auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));
  2173. DebugOptions->dump_schedule_constraints = false;
  2174. DebugOptions->dump_schedule = false;
  2175. DebugOptions->dump_final_schedule = false;
  2176. DebugOptions->dump_sizes = false;
  2177. DebugOptions->verbose = false;
  2178. Options->debug = DebugOptions;
  2179. Options->group_chains = false;
  2180. Options->reschedule = true;
  2181. Options->scale_tile_loops = false;
  2182. Options->wrap = false;
  2183. Options->non_negative_parameters = false;
  2184. Options->ctx = nullptr;
  2185. Options->sizes = nullptr;
  2186. Options->tile = true;
  2187. Options->tile_size = 32;
  2188. Options->isolate_full_tiles = false;
  2189. Options->use_private_memory = PrivateMemory;
  2190. Options->use_shared_memory = SharedMemory;
  2191. Options->max_shared_memory = 48 * 1024;
  2192. Options->target = PPCG_TARGET_CUDA;
  2193. Options->openmp = false;
  2194. Options->linearize_device_arrays = true;
  2195. Options->allow_gnu_extensions = false;
  2196. Options->unroll_copy_shared = false;
  2197. Options->unroll_gpu_tile = false;
2198. Options->live_range_reordering = true;
  2200. Options->hybrid = false;
  2201. Options->opencl_compiler_options = nullptr;
  2202. Options->opencl_use_gpu = false;
  2203. Options->opencl_n_include_file = 0;
  2204. Options->opencl_include_files = nullptr;
  2205. Options->opencl_print_kernel_types = false;
  2206. Options->opencl_embed_kernel_code = false;
  2207. Options->save_schedule_file = nullptr;
  2208. Options->load_schedule_file = nullptr;
  2209. return Options;
  2210. }
  2211. /// Get a tagged access relation containing all accesses of type @p AccessTy.
  2212. ///
  2213. /// Instead of a normal access of the form:
  2214. ///
  2215. /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  2216. ///
  2217. /// a tagged access has the form
  2218. ///
  2219. /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  2220. ///
  2221. /// where 'id' is an additional space that references the memory access that
  2222. /// triggered the access.
  2223. ///
  2224. /// @param AccessTy The type of the memory accesses to collect.
  2225. ///
  2226. /// @return The relation describing all tagged memory accesses.
  2227. isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
  2228. isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release());
  2229. for (auto &Stmt : *S)
  2230. for (auto &Acc : Stmt)
  2231. if (Acc->getType() == AccessTy) {
  2232. isl_map *Relation = Acc->getAccessRelation().release();
  2233. Relation =
  2234. isl_map_intersect_domain(Relation, Stmt.getDomain().release());
  2235. isl_space *Space = isl_map_get_space(Relation);
  2236. Space = isl_space_range(Space);
  2237. Space = isl_space_from_range(Space);
  2238. Space =
  2239. isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
  2240. isl_map *Universe = isl_map_universe(Space);
  2241. Relation = isl_map_domain_product(Relation, Universe);
  2242. Accesses = isl_union_map_add_map(Accesses, Relation);
  2243. }
  2244. return Accesses;
  2245. }
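// PPCG consumes these "tagged" (wrapped) relations to reason about
// individual references rather than whole statements, which is what its
// live-range reordering support requires.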
  2246. /// Get the set of all read accesses, tagged with the access id.
  2247. ///
  2248. /// @see getTaggedAccesses
  2249. isl_union_map *getTaggedReads() {
  2250. return getTaggedAccesses(MemoryAccess::READ);
  2251. }
2252. /// Get the set of all may (and must) write accesses, tagged with the access id.
  2253. ///
  2254. /// @see getTaggedAccesses
  2255. isl_union_map *getTaggedMayWrites() {
  2256. return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
  2257. getTaggedAccesses(MemoryAccess::MUST_WRITE));
  2258. }
2259. /// Get the set of all must write accesses, tagged with the access id.
  2260. ///
  2261. /// @see getTaggedAccesses
  2262. isl_union_map *getTaggedMustWrites() {
  2263. return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  2264. }
  2265. /// Collect parameter and array names as isl_ids.
  2266. ///
  2267. /// To reason about the different parameters and arrays used, ppcg requires
2268. /// a list of all isl_ids in use. As PPCG traditionally performs
2269. /// source-to-source compilation, each of these isl_ids is mapped to the
  2270. /// expression that represents it. As we do not have a corresponding
  2271. /// expression in Polly, we just map each id to a 'zero' expression to match
  2272. /// the data format that ppcg expects.
  2273. ///
2274. /// @returns A map from collected ids to 'zero' ast expressions.
  2275. __isl_give isl_id_to_ast_expr *getNames() {
  2276. auto *Names = isl_id_to_ast_expr_alloc(
  2277. S->getIslCtx().get(),
  2278. S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
  2279. auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get()));
  2280. for (const SCEV *P : S->parameters()) {
  2281. isl_id *Id = S->getIdForParam(P).release();
  2282. Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  2283. }
  2284. for (auto &Array : S->arrays()) {
  2285. auto Id = Array->getBasePtrId().release();
  2286. Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  2287. }
  2288. isl_ast_expr_free(Zero);
  2289. return Names;
  2290. }
  2291. /// Create a new PPCG scop from the current scop.
  2292. ///
  2293. /// The PPCG scop is initialized with data from the current polly::Scop. From
  2294. /// this initial data, the data-dependences in the PPCG scop are initialized.
  2295. /// We do not use Polly's dependence analysis for now, to ensure we match
  2296. /// the PPCG default behaviour more closely.
  2297. ///
  2298. /// @returns A new ppcg scop.
  2299. ppcg_scop *createPPCGScop() {
  2300. MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
  2301. auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
  2302. PPCGScop->options = createPPCGOptions();
2303. // Enable live range reordering.
  2304. PPCGScop->options->live_range_reordering = 1;
  2305. PPCGScop->start = 0;
  2306. PPCGScop->end = 0;
  2307. PPCGScop->context = S->getContext().release();
  2308. PPCGScop->domain = S->getDomains().release();
  2309. // TODO: investigate this further. PPCG calls collect_call_domains.
  2310. PPCGScop->call = isl_union_set_from_set(S->getContext().release());
  2311. PPCGScop->tagged_reads = getTaggedReads();
  2312. PPCGScop->reads = S->getReads().release();
  2313. PPCGScop->live_in = nullptr;
  2314. PPCGScop->tagged_may_writes = getTaggedMayWrites();
  2315. PPCGScop->may_writes = S->getWrites().release();
  2316. PPCGScop->tagged_must_writes = getTaggedMustWrites();
  2317. PPCGScop->must_writes = S->getMustWrites().release();
  2318. PPCGScop->live_out = nullptr;
  2319. PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.release();
  2320. PPCGScop->must_kills = KillsInfo.MustKills.release();
  2321. PPCGScop->tagger = nullptr;
  2322. PPCGScop->independence =
  2323. isl_union_map_empty(isl_set_get_space(PPCGScop->context));
  2324. PPCGScop->dep_flow = nullptr;
  2325. PPCGScop->tagged_dep_flow = nullptr;
  2326. PPCGScop->dep_false = nullptr;
  2327. PPCGScop->dep_forced = nullptr;
  2328. PPCGScop->dep_order = nullptr;
  2329. PPCGScop->tagged_dep_order = nullptr;
  2330. PPCGScop->schedule = S->getScheduleTree().release();
  2331. // If we have something non-trivial to kill, add it to the schedule
  2332. if (KillsInfo.KillsSchedule.get())
  2333. PPCGScop->schedule = isl_schedule_sequence(
  2334. PPCGScop->schedule, KillsInfo.KillsSchedule.release());
  2335. PPCGScop->names = getNames();
  2336. PPCGScop->pet = nullptr;
  2337. compute_tagger(PPCGScop);
  2338. compute_dependences(PPCGScop);
  2339. eliminate_dead_code(PPCGScop);
  2340. return PPCGScop;
  2341. }
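// Note on ownership: the isl objects moved into the ppcg_scop above are
// released by ppcg_scop_free(); the malloc'ed options are released
// separately via freeOptions().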
  2342. /// Collect the array accesses in a statement.
  2343. ///
  2344. /// @param Stmt The statement for which to collect the accesses.
  2345. ///
  2346. /// @returns A list of array accesses.
  2347. gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
  2348. gpu_stmt_access *Accesses = nullptr;
  2349. for (MemoryAccess *Acc : Stmt) {
  2350. auto Access =
  2351. isl_alloc_type(S->getIslCtx().get(), struct gpu_stmt_access);
  2352. Access->read = Acc->isRead();
  2353. Access->write = Acc->isWrite();
  2354. Access->access = Acc->getAccessRelation().release();
  2355. isl_space *Space = isl_map_get_space(Access->access);
  2356. Space = isl_space_range(Space);
  2357. Space = isl_space_from_range(Space);
  2358. Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
  2359. isl_map *Universe = isl_map_universe(Space);
  2360. Access->tagged_access =
  2361. isl_map_domain_product(Acc->getAccessRelation().release(), Universe);
  2362. Access->exact_write = !Acc->isMayWrite();
  2363. Access->ref_id = Acc->getId().release();
  2364. Access->next = Accesses;
  2365. Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
  2366. // TODO: Also mark one-element accesses to arrays as fixed-element.
  2367. Access->fixed_element =
  2368. Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false;
  2369. Accesses = Access;
  2370. }
  2371. return Accesses;
  2372. }
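// Note: accesses are prepended while iterating, so the resulting list is in
// reverse order of the statement's access list; as far as we know, PPCG does
// not rely on a particular order here.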
  2373. /// Collect the list of GPU statements.
  2374. ///
  2375. /// Each statement has an id, a pointer to the underlying data structure,
  2376. /// as well as a list with all memory accesses.
  2377. ///
  2380. /// @returns A linked-list of statements.
  2381. gpu_stmt *getStatements() {
  2382. gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx().get(), struct gpu_stmt,
  2383. std::distance(S->begin(), S->end()));
  2384. int i = 0;
  2385. for (auto &Stmt : *S) {
  2386. gpu_stmt *GPUStmt = &Stmts[i];
  2387. GPUStmt->id = Stmt.getDomainId().release();
  2388. // We use the pet stmt pointer to keep track of the Polly statements.
  2389. GPUStmt->stmt = (pet_stmt *)&Stmt;
  2390. GPUStmt->accesses = getStmtAccesses(Stmt);
  2391. i++;
  2392. }
  2393. return Stmts;
  2394. }
  2395. /// Derive the extent of an array.
  2396. ///
  2397. /// The extent of an array is the set of elements that are within the
2398. /// accessed array. For the inner dimensions, the extent is bounded below by
2399. /// 0 and above by the size of the corresponding array dimension. For the
2400. /// first (outermost) dimension, the extent is bounded by the minimal and
2401. /// maximal subscript values accessed in that dimension.
  2402. ///
  2403. /// @param Array The array to derive the extent for.
  2404. ///
  2405. /// @returns An isl_set describing the extent of the array.
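/// For illustration (hypothetical values): for an array A[*][64] whose first
/// subscript is accessed in the range 5 <= i <= n, the derived extent is
///   [n] -> { A[o0, o1] : 5 <= o0 <= n and 0 <= o1 <= 63 }
/// i.e. the outer bounds come from the accessed subscripts, the inner bound
/// from the known dimension size.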
  2406. isl::set getExtent(ScopArrayInfo *Array) {
2407. unsigned NumDims = Array->getNumberOfDimensions();
2408. if (NumDims == 0)
  2409. return isl::set::universe(Array->getSpace());
  2410. isl::union_map Accesses = S->getAccesses(Array);
  2411. isl::union_set AccessUSet = Accesses.range();
  2412. AccessUSet = AccessUSet.coalesce();
  2413. AccessUSet = AccessUSet.detect_equalities();
  2414. AccessUSet = AccessUSet.coalesce();
  2415. if (AccessUSet.is_empty())
  2416. return isl::set::empty(Array->getSpace());
  2417. isl::set AccessSet = AccessUSet.extract_set(Array->getSpace());
  2418. isl::local_space LS = isl::local_space(Array->getSpace());
  2419. isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, 0);
  2420. isl::pw_aff OuterMin = AccessSet.dim_min(0);
  2421. isl::pw_aff OuterMax = AccessSet.dim_max(0);
  2422. OuterMin = OuterMin.add_dims(isl::dim::in,
  2423. unsignedFromIslSize(Val.dim(isl::dim::in)));
  2424. OuterMax = OuterMax.add_dims(isl::dim::in,
  2425. unsignedFromIslSize(Val.dim(isl::dim::in)));
  2426. OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId());
  2427. OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId());
  2428. isl::set Extent = isl::set::universe(Array->getSpace());
  2429. Extent = Extent.intersect(OuterMin.le_set(Val));
  2430. Extent = Extent.intersect(OuterMax.ge_set(Val));
  2431. for (unsigned i = 1; i < NumDims; ++i)
  2432. Extent = Extent.lower_bound_si(isl::dim::set, i, 0);
  2433. for (unsigned i = 0; i < NumDims; ++i) {
  2434. isl::pw_aff PwAff = Array->getDimensionSizePw(i);
2435. // The dimension-size isl_pw_aff can be null for the outermost dimension
2436. // (i == 0); only Fortran arrays carry an explicit size for that dimension.
  2437. if (PwAff.is_null()) {
  2438. assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
  2439. continue;
  2440. }
  2441. isl::pw_aff Val = isl::aff::var_on_domain(
  2442. isl::local_space(Array->getSpace()), isl::dim::set, i);
  2443. PwAff = PwAff.add_dims(isl::dim::in,
  2444. unsignedFromIslSize(Val.dim(isl::dim::in)));
  2445. PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in));
  2446. isl::set Set = PwAff.gt_set(Val);
  2447. Extent = Set.intersect(Extent);
  2448. }
  2449. return Extent;
  2450. }
  2451. /// Derive the bounds of an array.
  2452. ///
  2453. /// For the first dimension we derive the bound of the array from the extent
  2454. /// of this dimension. For inner dimensions we obtain their size directly from
  2455. /// ScopArrayInfo.
  2456. ///
  2457. /// @param PPCGArray The array to compute bounds for.
  2458. /// @param Array The polly array from which to take the information.
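/// For illustration (hypothetical values): given the extent
///   [n] -> { A[o0, o1] : 0 <= o0 <= n and 0 <= o1 <= 63 }
/// the computed bounds are roughly { A[(1 + n), (64)] }, i.e. the maximal
/// first subscript plus one, followed by the declared sizes of the inner
/// dimensions.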
  2459. void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
  2460. std::vector<isl_pw_aff *> Bounds;
  2461. if (PPCGArray.n_index > 0) {
  2462. if (isl_set_is_empty(PPCGArray.extent)) {
  2463. isl_set *Dom = isl_set_copy(PPCGArray.extent);
  2464. isl_local_space *LS = isl_local_space_from_space(
  2465. isl_space_params(isl_set_get_space(Dom)));
  2466. isl_set_free(Dom);
  2467. isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
  2468. Bounds.push_back(Zero);
  2469. } else {
  2470. isl_set *Dom = isl_set_copy(PPCGArray.extent);
  2471. Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
  2472. isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
  2473. isl_set_free(Dom);
  2474. Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
  2475. isl_local_space *LS =
  2476. isl_local_space_from_space(isl_set_get_space(Dom));
  2477. isl_aff *One = isl_aff_zero_on_domain(LS);
  2478. One = isl_aff_add_constant_si(One, 1);
  2479. Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
  2480. Bound = isl_pw_aff_gist(Bound, S->getContext().release());
  2481. Bounds.push_back(Bound);
  2482. }
  2483. }
  2484. for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
  2485. isl_pw_aff *Bound = Array->getDimensionSizePw(i).release();
  2486. auto LS = isl_pw_aff_get_domain_space(Bound);
  2487. auto Aff = isl_multi_aff_zero(LS);
  2488. // We need types to work out, which is why we perform this weird dance
  2489. // with `Aff` and `Bound`. Consider this example:
  2490. // LS: [p] -> { [] }
  2491. // Zero: [p] -> { [] } | Implicitly, is [p] -> { ~ -> [] }.
  2492. // This `~` is used to denote a "null space" (which is different from
  2493. // a *zero dimensional* space), which is something that ISL does not
  2494. // show you when pretty printing.
  2495. // Bound: [p] -> { [] -> [(10p)] } | Here, the [] is a *zero dimensional*
  2496. // space, not a "null space" which does not exist at all.
  2497. // When we pullback (precompose) `Bound` with `Zero`, we get:
  2498. // Bound . Zero =
  2499. // ([p] -> { [] -> [(10p)] }) . ([p] -> {~ -> [] }) =
  2500. // [p] -> { ~ -> [(10p)] } =
  2501. // [p] -> [(10p)] (as ISL pretty prints it)
  2502. // Bound Pullback: [p] -> { [(10p)] }
  2503. // We want this kind of an expression for Bound, without a
  2504. // zero dimensional input, but with a "null space" input for the types
  2505. // to work out later on, as far as I (Siddharth Bhat) understand.
  2506. // I was unable to find a reference to this in the ISL manual.
  2507. // References: Tobias Grosser.
  2508. Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
  2509. Bounds.push_back(Bound);
  2510. }
2511. /// To construct an `isl_multi_pw_aff`, we need all the individual `pw_aff`s
  2512. /// to have the same parameter dimensions. So, we need to align them to an
  2513. /// appropriate space.
  2514. /// Scop::Context is _not_ an appropriate space, because when we have
  2515. /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not
  2516. /// contain all parameter dimensions.
  2517. /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together.
  2518. isl_space *SeedAlignSpace = S->getParamSpace().release();
  2519. SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1);
  2520. isl_space *AlignSpace = nullptr;
  2521. std::vector<isl_pw_aff *> AlignedBounds;
  2522. std::tie(AlignSpace, AlignedBounds) =
  2523. alignPwAffs(std::move(Bounds), SeedAlignSpace);
  2524. assert(AlignSpace && "alignPwAffs did not initialise AlignSpace");
  2525. isl_pw_aff_list *BoundsList =
  2526. createPwAffList(S->getIslCtx().get(), std::move(AlignedBounds));
  2527. isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);
  2528. BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace);
  2529. assert(BoundsSpace && "Unable to access space of array.");
  2530. assert(BoundsList && "Unable to access list of bounds.");
  2531. PPCGArray.bound =
  2532. isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
  2533. assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
  2534. }
  2535. /// Create the arrays for @p PPCGProg.
  2536. ///
  2537. /// @param PPCGProg The program to compute the arrays for.
  2538. void createArrays(gpu_prog *PPCGProg,
  2539. const SmallVector<ScopArrayInfo *, 4> &ValidSAIs) {
  2540. int i = 0;
  2541. for (auto &Array : ValidSAIs) {
  2542. std::string TypeName;
  2543. raw_string_ostream OS(TypeName);
  2544. OS << *Array->getElementType();
  2545. TypeName = OS.str();
  2546. gpu_array_info &PPCGArray = PPCGProg->array[i];
  2547. PPCGArray.space = Array->getSpace().release();
  2548. PPCGArray.type = strdup(TypeName.c_str());
  2549. PPCGArray.size = DL->getTypeAllocSize(Array->getElementType());
  2550. PPCGArray.name = strdup(Array->getName().c_str());
  2551. PPCGArray.extent = nullptr;
  2552. PPCGArray.n_index = Array->getNumberOfDimensions();
  2553. PPCGArray.extent = getExtent(Array).release();
  2554. PPCGArray.n_ref = 0;
  2555. PPCGArray.refs = nullptr;
  2556. PPCGArray.accessed = true;
  2557. PPCGArray.read_only_scalar =
  2558. Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
  2559. PPCGArray.has_compound_element = false;
  2560. PPCGArray.local = false;
  2561. PPCGArray.declare_local = false;
  2562. PPCGArray.global = false;
  2563. PPCGArray.linearize = false;
  2564. PPCGArray.dep_order = nullptr;
  2565. PPCGArray.user = Array;
  2566. PPCGArray.bound = nullptr;
  2567. setArrayBounds(PPCGArray, Array);
  2568. i++;
  2569. collect_references(PPCGProg, &PPCGArray);
  2570. PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray);
  2571. }
  2572. }
  2573. /// Create an identity map between the arrays in the scop.
  2574. ///
  2575. /// @returns An identity map between the arrays in the scop.
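/// For example, for arrays A and B this returns
/// { A[i0, i1] -> A[i0, i1]; B[i0] -> B[i0] }.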
  2576. isl_union_map *getArrayIdentity() {
  2577. isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release());
  2578. for (auto &Array : S->arrays()) {
  2579. isl_space *Space = Array->getSpace().release();
  2580. Space = isl_space_map_from_set(Space);
  2581. isl_map *Identity = isl_map_identity(Space);
  2582. Maps = isl_union_map_add_map(Maps, Identity);
  2583. }
  2584. return Maps;
  2585. }
  2586. /// Create a default-initialized PPCG GPU program.
  2587. ///
  2588. /// @returns A new gpu program description.
  2589. gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {
  2590. if (!PPCGScop)
  2591. return nullptr;
  2592. auto PPCGProg = isl_calloc_type(S->getIslCtx().get(), struct gpu_prog);
  2593. PPCGProg->ctx = S->getIslCtx().get();
  2594. PPCGProg->scop = PPCGScop;
  2595. PPCGProg->context = isl_set_copy(PPCGScop->context);
  2596. PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
  2597. PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
  2598. PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
  2599. PPCGProg->tagged_must_kill =
  2600. isl_union_map_copy(PPCGScop->tagged_must_kills);
  2601. PPCGProg->to_inner = getArrayIdentity();
  2602. PPCGProg->to_outer = getArrayIdentity();
  2603. // TODO: verify that this assignment is correct.
  2604. PPCGProg->any_to_outer = nullptr;
  2605. PPCGProg->n_stmts = std::distance(S->begin(), S->end());
  2606. PPCGProg->stmts = getStatements();
  2607. // Only consider arrays that have a non-empty extent.
  2608. // Otherwise, this will cause us to consider the following kinds of
  2609. // empty arrays:
  2610. // 1. Invariant loads that are represented by SAI objects.
  2611. // 2. Arrays with statically known zero size.
  2612. auto ValidSAIsRange =
  2613. make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool {
  2614. return !getExtent(SAI).is_empty();
  2615. });
  2616. SmallVector<ScopArrayInfo *, 4> ValidSAIs(ValidSAIsRange.begin(),
  2617. ValidSAIsRange.end());
2618. PPCGProg->n_array = ValidSAIs.size();
  2620. PPCGProg->array = isl_calloc_array(
  2621. S->getIslCtx().get(), struct gpu_array_info, PPCGProg->n_array);
  2622. createArrays(PPCGProg, ValidSAIs);
  2623. PPCGProg->array_order = nullptr;
  2624. collect_order_dependences(PPCGProg);
  2625. PPCGProg->may_persist = compute_may_persist(PPCGProg);
  2626. return PPCGProg;
  2627. }
  2628. struct PrintGPUUserData {
  2629. struct cuda_info *CudaInfo;
  2630. struct gpu_prog *PPCGProg;
  2631. std::vector<ppcg_kernel *> Kernels;
  2632. };
  2633. /// Print a user statement node in the host code.
  2634. ///
  2635. /// We use ppcg's printing facilities to print the actual statement and
  2636. /// additionally build up a list of all kernels that are encountered in the
  2637. /// host ast.
  2638. ///
  2639. /// @param P The printer to print to
  2640. /// @param Options The printing options to use
  2641. /// @param Node The node to print
  2642. /// @param User A user pointer to carry additional data. This pointer is
  2643. /// expected to be of type PrintGPUUserData.
  2644. ///
  2645. /// @returns A printer to which the output has been printed.
  2646. static __isl_give isl_printer *
  2647. printHostUser(__isl_take isl_printer *P,
  2648. __isl_take isl_ast_print_options *Options,
  2649. __isl_take isl_ast_node *Node, void *User) {
  2650. auto Data = (struct PrintGPUUserData *)User;
  2651. auto Id = isl_ast_node_get_annotation(Node);
  2652. if (Id) {
  2653. bool IsUser = !strcmp(isl_id_get_name(Id), "user");
  2654. // If this is a user statement, format it ourselves as ppcg would
  2655. // otherwise try to call pet functionality that is not available in
  2656. // Polly.
  2657. if (IsUser) {
  2658. P = isl_printer_start_line(P);
  2659. P = isl_printer_print_ast_node(P, Node);
  2660. P = isl_printer_end_line(P);
  2661. isl_id_free(Id);
  2662. isl_ast_print_options_free(Options);
  2663. return P;
  2664. }
  2665. auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
  2666. isl_id_free(Id);
  2667. Data->Kernels.push_back(Kernel);
  2668. }
  2669. return print_host_user(P, Options, Node, User);
  2670. }
  2671. /// Print C code corresponding to the control flow in @p Kernel.
  2672. ///
  2673. /// @param Kernel The kernel to print
  2674. void printKernel(ppcg_kernel *Kernel) {
  2675. auto *P = isl_printer_to_str(S->getIslCtx().get());
  2676. P = isl_printer_set_output_format(P, ISL_FORMAT_C);
  2677. auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get());
  2678. P = isl_ast_node_print(Kernel->tree, P, Options);
  2679. char *String = isl_printer_get_str(P);
  2680. outs() << String << "\n";
  2681. free(String);
  2682. isl_printer_free(P);
  2683. }
  2684. /// Print C code corresponding to the GPU code described by @p Tree.
  2685. ///
  2686. /// @param Tree An AST describing GPU code
2687. /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  2688. void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
  2689. auto *P = isl_printer_to_str(S->getIslCtx().get());
  2690. P = isl_printer_set_output_format(P, ISL_FORMAT_C);
  2691. PrintGPUUserData Data;
  2692. Data.PPCGProg = PPCGProg;
  2693. auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get());
  2694. Options =
  2695. isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
  2696. P = isl_ast_node_print(Tree, P, Options);
  2697. char *String = isl_printer_get_str(P);
  2698. outs() << "# host\n";
  2699. outs() << String << "\n";
  2700. free(String);
  2701. isl_printer_free(P);
  2702. for (auto Kernel : Data.Kernels) {
  2703. outs() << "# kernel" << Kernel->id << "\n";
  2704. printKernel(Kernel);
  2705. }
  2706. }
  2707. // Generate a GPU program using PPCG.
  2708. //
  2709. // GPU mapping consists of multiple steps:
  2710. //
2711. // 1) Compute a new schedule for the program.
2712. // 2) Map the schedule to the GPU.
2713. // 3) Generate code for the new schedule.
  2714. //
  2715. // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
  2716. // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  2717. // strategy directly from this pass.
  2718. gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {
  2719. auto PPCGGen = isl_calloc_type(S->getIslCtx().get(), struct gpu_gen);
  2720. PPCGGen->ctx = S->getIslCtx().get();
  2721. PPCGGen->options = PPCGScop->options;
  2722. PPCGGen->print = nullptr;
  2723. PPCGGen->print_user = nullptr;
  2724. PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
  2725. PPCGGen->prog = PPCGProg;
  2726. PPCGGen->tree = nullptr;
  2727. PPCGGen->types.n = 0;
  2728. PPCGGen->types.name = nullptr;
  2729. PPCGGen->sizes = nullptr;
  2730. PPCGGen->used_sizes = nullptr;
  2731. PPCGGen->kernel_id = 0;
2732. // Set the scheduling strategy to the same strategy PPCG is using.
  2733. isl_options_set_schedule_serialize_sccs(PPCGGen->ctx, false);
  2734. isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
  2735. isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
  2736. isl_options_set_schedule_whole_component(PPCGGen->ctx, false);
  2737. isl_schedule *Schedule = get_schedule(PPCGGen);
  2738. int has_permutable = has_any_permutable_node(Schedule);
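// has_any_permutable_node returns a negative value on error, so the check
// below bails out both when no permutable band exists and on error.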
  2739. Schedule =
  2740. isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
  2741. if (!has_permutable || has_permutable < 0) {
  2742. Schedule = isl_schedule_free(Schedule);
  2743. LLVM_DEBUG(dbgs() << getUniqueScopName(S)
  2744. << " does not have permutable bands. Bailing out\n";);
  2745. } else {
  2746. const bool CreateTransferToFromDevice = !PollyManagedMemory;
  2747. Schedule = map_to_device(PPCGGen, Schedule, CreateTransferToFromDevice);
  2748. PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
  2749. }
  2750. if (DumpSchedule) {
  2751. isl_printer *P = isl_printer_to_str(S->getIslCtx().get());
  2752. P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
  2753. P = isl_printer_print_str(P, "Schedule\n");
  2754. P = isl_printer_print_str(P, "========\n");
  2755. if (Schedule)
  2756. P = isl_printer_print_schedule(P, Schedule);
  2757. else
  2758. P = isl_printer_print_str(P, "No schedule found\n");
2759. char *ScheduleStr = isl_printer_get_str(P);
outs() << ScheduleStr << "\n";
free(ScheduleStr);
  2760. isl_printer_free(P);
  2761. }
  2762. if (DumpCode) {
  2763. outs() << "Code\n";
  2764. outs() << "====\n";
  2765. if (PPCGGen->tree)
  2766. printGPUTree(PPCGGen->tree, PPCGProg);
  2767. else
  2768. outs() << "No code generated\n";
  2769. }
  2770. isl_schedule_free(Schedule);
  2771. return PPCGGen;
  2772. }
  2773. /// Free gpu_gen structure.
  2774. ///
  2775. /// @param PPCGGen The ppcg_gen object to free.
  2776. void freePPCGGen(gpu_gen *PPCGGen) {
  2777. isl_ast_node_free(PPCGGen->tree);
  2778. isl_union_map_free(PPCGGen->sizes);
  2779. isl_union_map_free(PPCGGen->used_sizes);
  2780. free(PPCGGen);
  2781. }
  2782. /// Free the options in the ppcg scop structure.
  2783. ///
2784. /// ppcg does not free these options for us. To avoid leaks, we do this
  2785. /// ourselves.
  2786. ///
  2787. /// @param PPCGScop The scop referencing the options to free.
  2788. void freeOptions(ppcg_scop *PPCGScop) {
  2789. free(PPCGScop->options->debug);
  2790. PPCGScop->options->debug = nullptr;
  2791. free(PPCGScop->options);
  2792. PPCGScop->options = nullptr;
  2793. }
  2794. /// Approximate the number of points in the set.
  2795. ///
  2796. /// This function returns an ast expression that overapproximates the number
  2797. /// of points in an isl set through the rectangular hull surrounding this set.
  2798. ///
  2799. /// @param Set The set to count.
  2800. /// @param Build The isl ast build object to use for creating the ast
  2801. /// expression.
  2802. ///
  2803. /// @returns An approximation of the number of points in the set.
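/// For example, for { [i, j] : 0 <= i < n and 0 <= j < 32 } this builds the
/// expression 1 * ((n - 1) - 0 + 1) * (31 - 0 + 1), i.e. n * 32.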
  2804. __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
  2805. __isl_keep isl_ast_build *Build) {
  2806. isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
  2807. auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
  2808. isl_space *Space = isl_set_get_space(Set);
  2809. Space = isl_space_params(Space);
  2810. auto *Univ = isl_set_universe(Space);
  2811. isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
  2812. for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) {
  2813. isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
  2814. isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
  2815. isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
  2816. DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
  2817. auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
  2818. Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
  2819. }
  2820. isl_set_free(Set);
  2821. isl_pw_aff_free(OneAff);
  2822. return Expr;
  2823. }
2824. /// Approximate the number of dynamic instructions executed by a given
  2825. /// statement.
  2826. ///
  2827. /// @param Stmt The statement for which to compute the number of dynamic
  2828. /// instructions.
  2829. /// @param Build The isl ast build object to use for creating the ast
  2830. /// expression.
  2831. /// @returns An approximation of the number of dynamic instructions executed
  2832. /// by @p Stmt.
  2833. __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
  2834. __isl_keep isl_ast_build *Build) {
  2835. auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build);
  2836. long InstCount = 0;
  2837. if (Stmt.isBlockStmt()) {
  2838. auto *BB = Stmt.getBasicBlock();
  2839. InstCount = std::distance(BB->begin(), BB->end());
  2840. } else {
  2841. auto *R = Stmt.getRegion();
  2842. for (auto *BB : R->blocks()) {
  2843. InstCount += std::distance(BB->begin(), BB->end());
  2844. }
  2845. }
  2846. isl_val *InstVal = isl_val_int_from_si(S->getIslCtx().get(), InstCount);
  2847. auto *InstExpr = isl_ast_expr_from_val(InstVal);
  2848. return isl_ast_expr_mul(InstExpr, Iterations);
  2849. }
2850. /// Approximate the number of dynamic instructions executed in the scop.
  2851. ///
  2852. /// @param S The scop for which to approximate dynamic instructions.
  2853. /// @param Build The isl ast build object to use for creating the ast
  2854. /// expression.
  2855. /// @returns An approximation of the number of dynamic instructions executed
  2856. /// in @p S.
  2857. __isl_give isl_ast_expr *
  2858. getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
  2859. isl_ast_expr *Instructions;
  2860. isl_val *Zero = isl_val_int_from_si(S.getIslCtx().get(), 0);
  2861. Instructions = isl_ast_expr_from_val(Zero);
  2862. for (ScopStmt &Stmt : S) {
  2863. isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
  2864. Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
  2865. }
  2866. return Instructions;
  2867. }
  2868. /// Create a check that ensures sufficient compute in scop.
  2869. ///
  2870. /// @param S The scop for which to ensure sufficient compute.
  2871. /// @param Build The isl ast build object to use for creating the ast
  2872. /// expression.
  2873. /// @returns An expression that evaluates to TRUE in case of sufficient
  2874. /// compute and to FALSE, otherwise.
  2875. __isl_give isl_ast_expr *
  2876. createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
  2877. auto Iterations = getNumberOfIterations(S, Build);
  2878. auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx().get(), MinCompute);
  2879. auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
  2880. return isl_ast_expr_ge(Iterations, MinComputeExpr);
  2881. }
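// When this expression evaluates to false, the run-time check built in
// generateCode() falls back to the original host code, so scops whose
// approximated dynamic instruction count is below MinCompute stay on the CPU.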
2882. /// Check if the basic block uses a function we cannot generate code for in
2883. /// GPU kernels.
  2884. ///
  2885. /// If this basic block does something with a `Function` other than calling
  2886. /// a function that we support in a kernel, return true.
  2887. bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB,
  2888. bool AllowCUDALibDevice) {
  2889. for (const Instruction &Inst : *BB) {
  2890. const CallInst *Call = dyn_cast<CallInst>(&Inst);
  2891. if (Call && isValidFunctionInKernel(Call->getCalledFunction(),
  2892. AllowCUDALibDevice))
  2893. continue;
  2894. for (Value *Op : Inst.operands())
  2895. // Look for functions among operands of Inst.
  2896. if (isa<Function>(Op->stripPointerCasts())) {
  2897. LLVM_DEBUG(dbgs()
  2898. << Inst << " has illegal use of function in kernel.\n");
  2899. return true;
  2900. }
  2901. }
  2902. return false;
  2903. }
  2904. /// Return whether the Scop S uses functions in a way that we do not support.
  2905. bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) {
  2906. for (auto &Stmt : S) {
  2907. if (Stmt.isBlockStmt()) {
  2908. if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(),
  2909. AllowCUDALibDevice))
  2910. return true;
  2911. } else {
  2912. assert(Stmt.isRegionStmt() &&
  2913. "Stmt was neither block nor region statement");
  2914. for (const BasicBlock *BB : Stmt.getRegion()->blocks())
  2915. if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice))
  2916. return true;
  2917. }
  2918. }
  2919. return false;
  2920. }
  2921. /// Generate code for a given GPU AST described by @p Root.
  2922. ///
  2923. /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  2924. /// @param Prog The GPU Program to generate code for.
  2925. void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
  2926. ScopAnnotator Annotator;
  2927. Annotator.buildAliasScopes(*S);
  2928. Region *R = &S->getRegion();
  2929. simplifyRegion(R, DT, LI, RI);
  2930. BasicBlock *EnteringBB = R->getEnteringBlock();
  2931. PollyIRBuilder Builder(EnteringBB->getContext(), ConstantFolder(),
  2932. IRInserter(Annotator));
  2933. Builder.SetInsertPoint(EnteringBB->getTerminator());
  2934. // Only build the run-time condition and parameters _after_ having
  2935. // introduced the conditional branch. This is important as the conditional
  2936. // branch will guard the original scop from new induction variables that
  2937. // the SCEVExpander may introduce while code generating the parameters and
  2938. // which may introduce scalar dependences that prevent us from correctly
  2939. // code generating this scop.
  2940. BBPair StartExitBlocks;
  2941. BranchInst *CondBr = nullptr;
  2942. std::tie(StartExitBlocks, CondBr) =
  2943. executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
  2944. BasicBlock *StartBlock = std::get<0>(StartExitBlocks);
  2945. assert(CondBr && "CondBr not initialized by executeScopConditionally");
  2946. GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
  2947. StartBlock, Prog, Runtime, Architecture);
  2948. // TODO: Handle LICM
  2949. auto SplitBlock = StartBlock->getSinglePredecessor();
  2950. Builder.SetInsertPoint(SplitBlock->getTerminator());
  2951. isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx().get());
  2952. isl::ast_expr Condition =
  2953. IslAst::buildRunCondition(*S, isl::manage_copy(Build));
  2954. isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
  2955. Condition =
  2956. isl::manage(isl_ast_expr_and(Condition.release(), SufficientCompute));
  2957. isl_ast_build_free(Build);
2958. // Preload invariant loads. Note: This should happen before creating the
2959. // RTC, because the RTC may depend on values that are invariant-load hoisted.
  2960. if (!NodeBuilder.preloadInvariantLoads()) {
  2961. // Patch the introduced branch condition to ensure that we always execute
  2962. // the original SCoP.
  2963. auto *FalseI1 = Builder.getFalse();
  2964. auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator();
  2965. SplitBBTerm->setOperand(0, FalseI1);
  2966. LLVM_DEBUG(dbgs() << "preloading invariant loads failed in function: " +
  2967. S->getFunction().getName() +
  2968. " | Scop Region: " + S->getNameStr());
  2969. // adjust the dominator tree accordingly.
  2970. auto *ExitingBlock = StartBlock->getUniqueSuccessor();
  2971. assert(ExitingBlock);
  2972. auto *MergeBlock = ExitingBlock->getUniqueSuccessor();
  2973. assert(MergeBlock);
  2974. polly::markBlockUnreachable(*StartBlock, Builder);
  2975. polly::markBlockUnreachable(*ExitingBlock, Builder);
  2976. auto *ExitingBB = S->getExitingBlock();
  2977. assert(ExitingBB);
  2978. DT->changeImmediateDominator(MergeBlock, ExitingBB);
  2979. DT->eraseNode(ExitingBlock);
  2980. isl_ast_node_free(Root);
  2981. } else {
  2982. if (polly::PerfMonitoring) {
  2983. PerfMonitor P(*S, EnteringBB->getParent()->getParent());
  2984. P.initialize();
  2985. P.insertRegionStart(SplitBlock->getTerminator());
  2986. // TODO: actually think if this is the correct exiting block to place
  2987. // the `end` performance marker. Invariant load hoisting changes
  2988. // the CFG in a way that I do not precisely understand, so I
  2989. // (Siddharth<siddu.druid@gmail.com>) should come back to this and
  2990. // think about which exiting block to use.
  2991. auto *ExitingBlock = StartBlock->getUniqueSuccessor();
  2992. assert(ExitingBlock);
  2993. BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor();
  2994. P.insertRegionEnd(MergeBlock->getTerminator());
  2995. }
  2996. NodeBuilder.addParameters(S->getContext().release());
  2997. Value *RTC = NodeBuilder.createRTC(Condition.release());
  2998. Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
  2999. Builder.SetInsertPoint(&*StartBlock->begin());
  3000. NodeBuilder.create(Root);
  3001. }
3002. /// In case a sequential kernel has more surrounding loops than any parallel
  3003. /// kernel, the SCoP is probably mostly sequential. Hence, there is no
  3004. /// point in running it on a GPU.
  3005. if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
  3006. CondBr->setOperand(0, Builder.getFalse());
  3007. if (!NodeBuilder.BuildSuccessful)
  3008. CondBr->setOperand(0, Builder.getFalse());
  3009. }
  3010. bool runOnScop(Scop &CurrentScop) override {
  3011. S = &CurrentScop;
  3012. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  3013. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  3014. SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  3015. DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
  3016. RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
  3017. LLVM_DEBUG(dbgs() << "PPCGCodeGen running on : " << getUniqueScopName(S)
  3018. << " | loop depth: " << S->getMaxLoopDepth() << "\n");
3019. // We currently do not support functions other than intrinsics inside
3020. // kernels, as code generation would also need to offload the called
3021. // functions; otherwise the kernel could end up calling a host function.
3022. // Bailing out here also prevents codegen from trying to take the address
3023. // of an intrinsic function to send to the kernel.
  3024. if (containsInvalidKernelFunction(CurrentScop,
  3025. Architecture == GPUArch::NVPTX64)) {
  3026. LLVM_DEBUG(
  3027. dbgs() << getUniqueScopName(S)
  3028. << " contains function which cannot be materialised in a GPU "
  3029. "kernel. Bailing out.\n";);
  3030. return false;
  3031. }
  3032. auto PPCGScop = createPPCGScop();
  3033. auto PPCGProg = createPPCGProg(PPCGScop);
  3034. auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
  3035. if (PPCGGen->tree) {
  3036. generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
  3037. CurrentScop.markAsToBeSkipped();
  3038. } else {
  3039. LLVM_DEBUG(dbgs() << getUniqueScopName(S)
  3040. << " has empty PPCGGen->tree. Bailing out.\n");
  3041. }
  3042. freeOptions(PPCGScop);
  3043. freePPCGGen(PPCGGen);
  3044. gpu_prog_free(PPCGProg);
  3045. ppcg_scop_free(PPCGScop);
  3046. return true;
  3047. }
  3048. void printScop(raw_ostream &, Scop &) const override {}
  3049. void getAnalysisUsage(AnalysisUsage &AU) const override {
  3050. ScopPass::getAnalysisUsage(AU);
  3051. AU.addRequired<DominatorTreeWrapperPass>();
  3052. AU.addRequired<RegionInfoPass>();
  3053. AU.addRequired<ScalarEvolutionWrapperPass>();
  3054. AU.addRequired<ScopDetectionWrapperPass>();
  3055. AU.addRequired<ScopInfoRegionPass>();
  3056. AU.addRequired<LoopInfoWrapperPass>();
  3057. // FIXME: We do not yet add regions for the newly generated code to the
  3058. // region tree.
  3059. }
  3060. };
  3061. } // namespace
  3062. char PPCGCodeGeneration::ID = 1;
  3063. Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  3064. PPCGCodeGeneration *generator = new PPCGCodeGeneration();
  3065. generator->Runtime = Runtime;
  3066. generator->Architecture = Arch;
  3067. return generator;
  3068. }
  3069. INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
  3070. "Polly - Apply PPCG translation to SCOP", false, false)
  3071. INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
  3072. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
  3073. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
  3074. INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
  3075. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
  3076. INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
  3077. INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
  3078. "Polly - Apply PPCG translation to SCOP", false, false)