//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a device stub function to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and that it really ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace
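
// The two helpers below pick the vendor prefix for runtime entry points; for
// example, addPrefixToName("LaunchKernel") yields "hipLaunchKernel" when
// compiling HIP and "cudaLaunchKernel" otherwise, and
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__hipRegisterFatBinary" or "__cudaRegisterFatBinary".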
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto *CallbackFnTy = getCallbackFnTy();
  auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}
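
// getDeviceSideName() (below) returns the name under which a kernel or
// variable is known on the device side. As an illustrative example (not
// emitted by this file): `__global__ void foo(int)` is identified by its
// Itanium mangling `_Z3fooi`, and that string is what later gets passed to
// __cudaRegisterFunction()/__hipRegisterFunction() so the runtime can match
// the host stub with the device function.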
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
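//
// Roughly speaking (a sketch, not the exact IR this emits), the stub generated
// for `__global__ void kern(int a, float *p)` behaves like:
//
//   void __device_stub__kern(int a, float *p) {
//     void *kernel_args[2] = {&a, &p};
//     dim3 grid_dim, block_dim; size_t shmem_size; void *stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)kern_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }
//
// where `kern_handle` stands for the kernel handle returned by
// getKernelHandle() (the stub itself for CUDA, a separate global for HIP), and
// the hip* spellings are used when compiling HIP.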
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr,
        CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
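
// A rough sketch of the legacy stub emitted below (assuming a kernel
// `kern(int a, float *p)`; approximate, control flow simplified):
//
//   void __device_stub__kern(int a, float *p) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) != cudaSuccess)
//       return;
//     if (cudaSetupArgument(&p, sizeof(p), /*offset=*/offset_p) != cudaSuccess)
//       return;
//     cudaLaunch((char *)kern_handle);
//   }
//
// `offset_p` stands for the running, alignment-adjusted offset computed in the
// loop below, `kern_handle` for the kernel handle; hipSetupArgument and
// hipLaunchByPtr are used when compiling HIP.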
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);
  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
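//
// Illustrative IR (approximate; names, types, and address spaces depend on the
// variable and target): a use such as
//   %v = load i32, i32* @x.managed
// is rewritten to go through the pointer the runtime fills in:
//   %ld.managed = load i32*, i32** @x
//   %v = load i32, i32* %ld.managed
// where @x is the ManagedVar holding the runtime-allocated address and
// @x.managed is the original variable (renamed by the callers of this helper).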
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction(I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle, Kernel0, ...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle, KernelM, ...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto *ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
      GpuBinaryHandle->setComdat(
          CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void **), void *,
    //                                           void *, void (*)(void *))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2 but
  // results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables of the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed from a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
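//
// For example (approximate; address spaces and exact types vary by target): a
// definition `__managed__ int x = 0;` initially becomes a global `@x`. After
// this pass, `@x` is a pointer-sized global that the runtime fills in with the
// address of the managed allocation, the original payload lives on as
// `@x.managed`, and device code reads `x` through a load of `@x` (see
// replaceManagedVar above).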
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler-used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute, since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  return makeModuleCtorFunction();
}
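
// A note on kernel handles (summarizing getKernelHandle() below; the IR is
// illustrative, not literal): for CUDA the handle is the stub function itself,
// so KernelHandles[F] == F and KernelStubs[F] == F. For HIP a separate global
// is emitted under the kernel's own mangled name, roughly
//   @<kernel mangled name> = constant void ()* @<device stub function>
// and it is that global, not the stub, that identifies the kernel in host
// code (see the KernelHandles comment near the top of this file).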
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F);
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
  KernelHandles[F] = Var;
  KernelStubs[Var] = F;
  return Var;
}