
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {
private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel's mangled name to a symbol that identifies the kernel in host
  // code. For CUDA, the symbol identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;
  /// Some zeros used for GEPs.
  llvm::Constant *Zeros[2];

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
  }
  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
  Zeros[1] = Zeros[0];

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto *CallbackFnTy = getCallbackFnTy();
  auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for device-side static file-scope variables for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
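//
// Roughly, the stub emitted below behaves like the following host code (a
// sketch only; the temporary names are illustrative and come from the allocas
// created further down):
//
//   void __device_stub__kernel(T0 a0, T1 a1, ...) {
//     void *kernel_args[] = {&a0, &a1, ...};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((void *)kernel_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }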
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr,
        CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  //   cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
  //                                dim3 blockDim, void **args,
  //                                size_t sharedMem, cudaStream_t stream);
  //   hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                    dim3 blockDim, void **args,
  //                                    size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
                                   LangOptions::GPUDefaultStreamKind::PerThread)
    KernelLaunchAPI = KernelLaunchAPI + "_spt";
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(
      KernelHandles[CGF.CurFn->getName()], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
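
// Legacy (pre-CUDA 9.0) launch path. The generated stub is roughly equivalent
// to the following host code (a sketch only; offsetN stands for the aligned
// running byte offset of the Nth argument computed below, and a nonzero
// cudaSetupArgument result skips the launch):
//
//   void __device_stub__kernel(T0 a0, T1 a1, ...) {
//     if (cudaSetupArgument(&a0, sizeof(a0), offset0) != 0) goto end;
//     if (cudaSetupArgument(&a1, sizeof(a1), offset1) != 0) goto end;
//     ...
//     cudaLaunch((char *)kernel_handle);
//   end:;
//   }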
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(
      KernelHandles[CGF.CurFn->getName()], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
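//
// In IR terms, a use such as
//   store i32 1, ptr @Var
// becomes (a sketch; value names are illustrative)
//   %ld.managed = load ptr, ptr @ManagedVar
//   store i32 1, ptr %ld.managed
// Constant expressions that refer to Var directly or indirectly are first
// rewritten into equivalent instructions so the load can be inserted in front
// of the instruction that ultimately uses the variable.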
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction(I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}
/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel->getName()], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto *ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}
/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module; however, there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
      GpuBinaryHandle->setComdat(
          CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, VoidPtrPtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32,
        /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}
/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module; however, there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}
void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of
  // device-side global variables become internal definitions. These have to
  // be internal in order to prevent name conflicts with global host variables
  // with the same name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables always need to be recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler.used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}
// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
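//
// For example (a sketch; the exact types and address spaces depend on the
// target), a managed variable
//   @x = global i32 0
// is rewritten for the device so that
//   @x = externally_initialized global ptr null   ; filled in by the runtime
//   @x.managed = global i32 0                     ; the original storage
// and every former use of the variable first loads the real address from @x.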
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}
// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
  llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule());
  OMPBuilder.initialize();

  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  for (KernelInfo &I : EmittedKernels)
    OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel->getName()],
                                   getDeviceSideName(cast<NamedDecl>(I.D)), 0,
                                   DeviceVarFlags::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      OMPBuilder.emitOffloadingEntry(
          I.Var, getDeviceSideName(I.D), VarSize,
          I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
                              : DeviceVarFlags::OffloadGlobalEntry,
          Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
                                     DeviceVarFlags::OffloadGlobalSurfaceEntry,
                                     Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
                                     DeviceVarFlags::OffloadGlobalTextureEntry,
                                     Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}
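
// Returns the symbol used by host code to identify the kernel. For CUDA this
// is the device stub function itself; for HIP a separate global variable (the
// kernel handle) is created here and later initialized with the stub in
// emitDeviceStub().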
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}