//===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Cuda.h"
#include "CommonArgs.h"
#include "clang/Basic/Cuda.h"
#include "clang/Config/config.h"
#include "clang/Driver/Compilation.h"
#include "clang/Driver/Distro.h"
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/InputInfo.h"
#include "clang/Driver/Options.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <system_error>

using namespace clang::driver;
using namespace clang::driver::toolchains;
using namespace clang::driver::tools;
using namespace clang;
using namespace llvm::opt;

namespace {
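// Map a raw CUDA_VERSION value from cuda.h (e.g. 11050 for CUDA-11.5) to the
// corresponding CudaVersion enumerator.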
CudaVersion getCudaVersion(uint32_t raw_version) {
  if (raw_version < 7050)
    return CudaVersion::CUDA_70;
  if (raw_version < 8000)
    return CudaVersion::CUDA_75;
  if (raw_version < 9000)
    return CudaVersion::CUDA_80;
  if (raw_version < 9010)
    return CudaVersion::CUDA_90;
  if (raw_version < 9020)
    return CudaVersion::CUDA_91;
  if (raw_version < 10000)
    return CudaVersion::CUDA_92;
  if (raw_version < 10010)
    return CudaVersion::CUDA_100;
  if (raw_version < 10020)
    return CudaVersion::CUDA_101;
  if (raw_version < 11000)
    return CudaVersion::CUDA_102;
  if (raw_version < 11010)
    return CudaVersion::CUDA_110;
  if (raw_version < 11020)
    return CudaVersion::CUDA_111;
  if (raw_version < 11030)
    return CudaVersion::CUDA_112;
  if (raw_version < 11040)
    return CudaVersion::CUDA_113;
  if (raw_version < 11050)
    return CudaVersion::CUDA_114;
  if (raw_version < 11060)
    return CudaVersion::CUDA_115;
  return CudaVersion::NEW;
}
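
// Parse the contents of cuda.h to determine the installed CUDA version from
// its CUDA_VERSION macro.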
CudaVersion parseCudaHFile(llvm::StringRef Input) {
  // Helper lambda which skips the given words if the line starts with them, or
  // returns None otherwise.
  auto StartsWithWords =
      [](llvm::StringRef Line,
         const SmallVector<StringRef, 3> words) -> llvm::Optional<StringRef> {
    for (StringRef word : words) {
      if (!Line.consume_front(word))
        return {};
      Line = Line.ltrim();
    }
    return Line;
  };

  Input = Input.ltrim();
  while (!Input.empty()) {
    if (auto Line =
            StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) {
      uint32_t RawVersion;
      Line->consumeInteger(10, RawVersion);
      return getCudaVersion(RawVersion);
    }
    // Find the next non-empty line.
    Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim();
  }
  return CudaVersion::UNKNOWN;
}
} // namespace
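
// Warn when the detected CUDA version is newer than the latest version this
// clang knows to be fully or partially supported.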
void CudaInstallationDetector::WarnIfUnsupportedVersion() {
  if (Version > CudaVersion::PARTIALLY_SUPPORTED) {
    std::string VersionString = CudaVersionToString(Version);
    if (!VersionString.empty())
      VersionString.insert(0, " ");
    D.Diag(diag::warn_drv_new_cuda_version)
        << VersionString
        << (CudaVersion::PARTIALLY_SUPPORTED != CudaVersion::FULLY_SUPPORTED)
        << CudaVersionToString(CudaVersion::PARTIALLY_SUPPORTED);
  } else if (Version > CudaVersion::FULLY_SUPPORTED)
    D.Diag(diag::warn_drv_partially_supported_cuda_version)
        << CudaVersionToString(Version);
}

CudaInstallationDetector::CudaInstallationDetector(
    const Driver &D, const llvm::Triple &HostTriple,
    const llvm::opt::ArgList &Args)
    : D(D) {
  struct Candidate {
    std::string Path;
    bool StrictChecking;

    Candidate(std::string Path, bool StrictChecking = false)
        : Path(Path), StrictChecking(StrictChecking) {}
  };
  SmallVector<Candidate, 4> Candidates;

  // In decreasing order so we prefer newer versions to older versions.
  std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
  auto &FS = D.getVFS();

  if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
    Candidates.emplace_back(
        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
  } else if (HostTriple.isOSWindows()) {
    for (const char *Ver : Versions)
      Candidates.emplace_back(
          D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
          Ver);
  } else {
    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
      // Try to find the ptxas binary. If the executable is located in a
      // directory called 'bin/', its parent directory might be a good guess
      // for a valid CUDA installation.
      // However, some distributions might install 'ptxas' to /usr/bin. In that
      // case the candidate would be '/usr', which passes the following checks
      // because '/usr/include' exists as well. To avoid this case, we always
      // check for the directory potentially containing files for libdevice,
      // even if the user passes -nocudalib.
      if (llvm::ErrorOr<std::string> ptxas =
              llvm::sys::findProgramByName("ptxas")) {
        SmallString<256> ptxasAbsolutePath;
        llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath);

        StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath);
        if (llvm::sys::path::filename(ptxasDir) == "bin")
          Candidates.emplace_back(
              std::string(llvm::sys::path::parent_path(ptxasDir)),
              /*StrictChecking=*/true);
      }
    }

    Candidates.emplace_back(D.SysRoot + "/usr/local/cuda");
    for (const char *Ver : Versions)
      Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver);

    Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple()));
    if (Dist.IsDebian() || Dist.IsUbuntu())
      // Special case for Debian to have nvidia-cuda-toolkit work
      // out of the box. More info on http://bugs.debian.org/882505
      Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda");
  }

  bool NoCudaLib = Args.hasArg(options::OPT_nogpulib);

  for (const auto &Candidate : Candidates) {
    InstallPath = Candidate.Path;
    if (InstallPath.empty() || !FS.exists(InstallPath))
      continue;

    BinPath = InstallPath + "/bin";
    IncludePath = InstallPath + "/include";
    LibDevicePath = InstallPath + "/nvvm/libdevice";

    if (!(FS.exists(IncludePath) && FS.exists(BinPath)))
      continue;
    bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking);
    if (CheckLibDevice && !FS.exists(LibDevicePath))
      continue;

    // On Linux, we have both lib and lib64 directories, and we need to choose
    // based on our triple. On MacOS, we have only a lib directory.
    //
    // It's sufficient for our purposes to be flexible: If both lib and lib64
    // exist, we choose whichever one matches our triple. Otherwise, if only
    // lib exists, we use it.
    if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
      LibPath = InstallPath + "/lib64";
    else if (FS.exists(InstallPath + "/lib"))
      LibPath = InstallPath + "/lib";
    else
      continue;

    Version = CudaVersion::UNKNOWN;
    if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h"))
      Version = parseCudaHFile((*CudaHFile)->getBuffer());
    // As the last resort, make an educated guess between CUDA-7.0, which had
    // old-style libdevice bitcode, and an unknown recent CUDA version.
    if (Version == CudaVersion::UNKNOWN) {
      Version = FS.exists(LibDevicePath + "/libdevice.10.bc")
                    ? CudaVersion::NEW
                    : CudaVersion::CUDA_70;
    }

    if (Version >= CudaVersion::CUDA_90) {
      // CUDA-9+ uses single libdevice file for all GPU variants.
      std::string FilePath = LibDevicePath + "/libdevice.10.bc";
      if (FS.exists(FilePath)) {
        for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E;
             ++Arch) {
          CudaArch GpuArch = static_cast<CudaArch>(Arch);
          if (!IsNVIDIAGpuArch(GpuArch))
            continue;
          std::string GpuArchName(CudaArchToString(GpuArch));
          LibDeviceMap[GpuArchName] = FilePath;
        }
      }
    } else {
      std::error_code EC;
      for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC),
                                         LE;
           !EC && LI != LE; LI = LI.increment(EC)) {
        StringRef FilePath = LI->path();
        StringRef FileName = llvm::sys::path::filename(FilePath);
        // Process all bitcode filenames that look like
        // libdevice.compute_XX.YY.bc
        const StringRef LibDeviceName = "libdevice.";
        if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
          continue;
        StringRef GpuArch = FileName.slice(
            LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
        LibDeviceMap[GpuArch] = FilePath.str();
        // Insert map entries for specific devices with this compute
        // capability. NVCC's choice of the libdevice library version is
        // rather peculiar and depends on the CUDA version.
        if (GpuArch == "compute_20") {
          LibDeviceMap["sm_20"] = std::string(FilePath);
          LibDeviceMap["sm_21"] = std::string(FilePath);
          LibDeviceMap["sm_32"] = std::string(FilePath);
        } else if (GpuArch == "compute_30") {
          LibDeviceMap["sm_30"] = std::string(FilePath);
          if (Version < CudaVersion::CUDA_80) {
            LibDeviceMap["sm_50"] = std::string(FilePath);
            LibDeviceMap["sm_52"] = std::string(FilePath);
            LibDeviceMap["sm_53"] = std::string(FilePath);
          }
          LibDeviceMap["sm_60"] = std::string(FilePath);
          LibDeviceMap["sm_61"] = std::string(FilePath);
          LibDeviceMap["sm_62"] = std::string(FilePath);
        } else if (GpuArch == "compute_35") {
          LibDeviceMap["sm_35"] = std::string(FilePath);
          LibDeviceMap["sm_37"] = std::string(FilePath);
        } else if (GpuArch == "compute_50") {
          if (Version >= CudaVersion::CUDA_80) {
            LibDeviceMap["sm_50"] = std::string(FilePath);
            LibDeviceMap["sm_52"] = std::string(FilePath);
            LibDeviceMap["sm_53"] = std::string(FilePath);
          }
        }
      }
    }

    // Check that we have found at least one libdevice that we can link in if
    // -nocudalib hasn't been specified.
    if (LibDeviceMap.empty() && !NoCudaLib)
      continue;

    IsValid = true;
    break;
  }
}
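
// Add the include arguments needed to compile CUDA code: the clang CUDA
// wrapper headers and the forced include of __clang_cuda_runtime_wrapper.h.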
void CudaInstallationDetector::AddCudaIncludeArgs(
    const ArgList &DriverArgs, ArgStringList &CC1Args) const {
  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    // Add cuda_wrappers/* to our system include path. This lets us wrap
    // standard library headers.
    SmallString<128> P(D.ResourceDir);
    llvm::sys::path::append(P, "include");
    llvm::sys::path::append(P, "cuda_wrappers");
    CC1Args.push_back("-internal-isystem");
    CC1Args.push_back(DriverArgs.MakeArgString(P));
  }

  if (DriverArgs.hasArg(options::OPT_nogpuinc))
    return;

  if (!isValid()) {
    D.Diag(diag::err_drv_no_cuda_installation);
    return;
  }

  CC1Args.push_back("-include");
  CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
}

void CudaInstallationDetector::CheckCudaVersionSupportsArch(
    CudaArch Arch) const {
  if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
      ArchsWithBadVersion[(int)Arch])
    return;

  auto MinVersion = MinVersionForCudaArch(Arch);
  auto MaxVersion = MaxVersionForCudaArch(Arch);
  if (Version < MinVersion || Version > MaxVersion) {
    ArchsWithBadVersion[(int)Arch] = true;
    D.Diag(diag::err_drv_cuda_version_unsupported)
        << CudaArchToString(Arch) << CudaVersionToString(MinVersion)
        << CudaVersionToString(MaxVersion) << InstallPath
        << CudaVersionToString(Version);
  }
}

void CudaInstallationDetector::print(raw_ostream &OS) const {
  if (isValid())
    OS << "Found CUDA installation: " << InstallPath << ", version "
       << CudaVersionToString(Version) << "\n";
}

namespace {
/// Debug info level for the NVPTX devices. We may need to emit a different
/// debug info level for the host and for the device itself. This type controls
/// emission of debug info for the devices: it either disables debug info
/// emission completely, emits debug directives only, or emits the same debug
/// info as for the host.
enum DeviceDebugInfoLevel {
  DisableDebugInfo,        /// Do not emit debug info for the devices.
  DebugDirectivesOnly,     /// Emit only debug directives.
  EmitSameDebugInfoAsHost, /// Use the same debug info level just like for the
                           /// host.
};
} // anonymous namespace

/// Determine the debug info level for the NVPTX devices. If debug info is
/// disabled for both the host and the device (-g0/-ggdb0 or no debug options
/// at all), no debug info is emitted. If only debug directives are requested
/// for both the host and the device (-gline-directives-only), or debug info
/// for the device alone is disabled (optimization is on and
/// --cuda-noopt-device-debug was not specified), only debug directives are
/// emitted for the device. Otherwise, use the same debug info level as for the
/// host (with the limitation that only the DWARF2 standard is supported).
static DeviceDebugInfoLevel mustEmitDebugInfo(const ArgList &Args) {
  const Arg *A = Args.getLastArg(options::OPT_O_Group);
  bool IsDebugEnabled = !A || A->getOption().matches(options::OPT_O0) ||
                        Args.hasFlag(options::OPT_cuda_noopt_device_debug,
                                     options::OPT_no_cuda_noopt_device_debug,
                                     /*Default=*/false);
  if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) {
    const Option &Opt = A->getOption();
    if (Opt.matches(options::OPT_gN_Group)) {
      if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0))
        return DisableDebugInfo;
      if (Opt.matches(options::OPT_gline_directives_only))
        return DebugDirectivesOnly;
    }
    return IsDebugEnabled ? EmitSameDebugInfoAsHost : DebugDirectivesOnly;
  }
  return willEmitRemarks(Args) ? DebugDirectivesOnly : DisableDebugInfo;
}

void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  StringRef GPUArchName;
  // If this is an OpenMP action we need to extract the device architecture
  // from the -march=arch option. This option may come from -Xopenmp-target
  // flag or the default value.
  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
    GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
    assert(!GPUArchName.empty() && "Must have an architecture passed in.");
  } else
    GPUArchName = JA.getOffloadingArch();

  // Obtain architecture from the action.
  CudaArch gpu_arch = StringToCudaArch(GPUArchName);
  assert(gpu_arch != CudaArch::UNKNOWN &&
         "Device action expected to have an architecture.");

  // Check that our installation's ptxas supports gpu_arch.
  if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
    TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);
  }

  ArgStringList CmdArgs;
  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
  DeviceDebugInfoLevel DIKind = mustEmitDebugInfo(Args);
  if (DIKind == EmitSameDebugInfoAsHost) {
    // ptxas does not accept -g option if optimization is enabled, so
    // we ignore the compiler's -O* options if we want debug info.
    CmdArgs.push_back("-g");
    CmdArgs.push_back("--dont-merge-basicblocks");
    CmdArgs.push_back("--return-at-end");
  } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
    // Map the -O we received to -O{0,1,2,3}.
    //
    // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
    // default, so it may correspond more closely to the spirit of clang -O2.

    // -O3 seems like the least-bad option when -Osomething is specified to
    // clang but it isn't handled below.
    StringRef OOpt = "3";
    if (A->getOption().matches(options::OPT_O4) ||
        A->getOption().matches(options::OPT_Ofast))
      OOpt = "3";
    else if (A->getOption().matches(options::OPT_O0))
      OOpt = "0";
    else if (A->getOption().matches(options::OPT_O)) {
      // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.
      OOpt = llvm::StringSwitch<const char *>(A->getValue())
                 .Case("1", "1")
                 .Case("2", "2")
                 .Case("3", "3")
                 .Case("s", "2")
                 .Case("z", "2")
                 .Default("2");
    }
    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
  } else {
    // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
    // to no optimizations, but ptxas's default is -O3.
    CmdArgs.push_back("-O0");
  }
  if (DIKind == DebugDirectivesOnly)
    CmdArgs.push_back("-lineinfo");

  // Pass -v to ptxas if it was passed to the driver.
  if (Args.hasArg(options::OPT_v))
    CmdArgs.push_back("-v");

  CmdArgs.push_back("--gpu-name");
  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
  CmdArgs.push_back("--output-file");
  CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output)));
  for (const auto &II : Inputs)
    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));

  for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
    CmdArgs.push_back(Args.MakeArgString(A));

  bool Relocatable = false;
  if (JA.isOffloading(Action::OFK_OpenMP))
    // In OpenMP we need to generate relocatable code.
    Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
                               options::OPT_fnoopenmp_relocatable_target,
                               /*Default=*/true);
  else if (JA.isOffloading(Action::OFK_Cuda))
    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
                               options::OPT_fno_gpu_rdc, /*Default=*/false);

  if (Relocatable)
    CmdArgs.push_back("-c");

  const char *Exec;
  if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ))
    Exec = A->getValue();
  else
    Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}
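
// Determine whether PTX for the given GPU architecture should be embedded into
// the fat binary, honoring the last matching --[no-]cuda-include-ptx= option.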
static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
  bool includePTX = true;
  for (Arg *A : Args) {
    if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
          A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
      continue;
    A->claim();
    const StringRef ArchStr = A->getValue();
    if (ArchStr == "all" || ArchStr == gpu_arch) {
      includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
      continue;
    }
  }
  return includePTX;
}

// All inputs to this linker must be from CudaDeviceActions, as we need to look
// at the Inputs' Actions in order to figure out which GPU architecture they
// correspond to.
void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  ArgStringList CmdArgs;
  if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100)
    CmdArgs.push_back("--cuda");
  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
  CmdArgs.push_back(Args.MakeArgString("--create"));
  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
    CmdArgs.push_back("-g");

  for (const auto &II : Inputs) {
    auto *A = II.getAction();
    assert(A->getInputs().size() == 1 &&
           "Device offload action is expected to have a single input");
    const char *gpu_arch_str = A->getOffloadingArch();
    assert(gpu_arch_str &&
           "Device action expected to have associated a GPU architecture!");
    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);

    if (II.getType() == types::TY_PP_Asm &&
        !shouldIncludePTX(Args, gpu_arch_str))
      continue;
    // We need to pass an Arch of the form "sm_XX" for cubin files and
    // "compute_XX" for ptx.
    const char *Arch = (II.getType() == types::TY_PP_Asm)
                           ? CudaArchToVirtualArchString(gpu_arch)
                           : gpu_arch_str;
    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                         Arch + ",file=" + II.getFilename()));
  }

  for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
    CmdArgs.push_back(Args.MakeArgString(A));

  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}
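
// Construct an nvlink job (invoked via clang-nvlink-wrapper) that links the
// device cubin files produced for OpenMP offloading.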
void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  ArgStringList CmdArgs;

  // OpenMP uses nvlink to link cubin files. The result will be embedded in the
  // host binary by the host linker.
  assert(!JA.isHostOffloading(Action::OFK_OpenMP) &&
         "CUDA toolchain not expected for an OpenMP host device.");

  if (Output.isFilename()) {
    CmdArgs.push_back("-o");
    CmdArgs.push_back(Output.getFilename());
  } else
    assert(Output.isNothing() && "Invalid output.");
  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
    CmdArgs.push_back("-g");

  if (Args.hasArg(options::OPT_v))
    CmdArgs.push_back("-v");

  StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
  assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");

  CmdArgs.push_back("-arch");
  CmdArgs.push_back(Args.MakeArgString(GPUArch));

  // Add paths specified in LIBRARY_PATH environment variable as -L options.
  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");

  // Add paths for the default clang library path.
  SmallString<256> DefaultLibPath =
      llvm::sys::path::parent_path(TC.getDriver().Dir);
  llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));

  for (const auto &II : Inputs) {
    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
          << getToolChain().getTripleString();
      continue;
    }

    // Currently, we only pass the input files to the linker, we do not pass
    // any libraries that may be valid only for the host.
    if (!II.isFilename())
      continue;

    const char *CubinF = C.addTempFile(
        C.getArgs().MakeArgString(getToolChain().getInputFilename(II)));

    CmdArgs.push_back(CubinF);
  }

  AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "nvptx",
                             GPUArch, /*isBitCodeSDL=*/false,
                             /*postClangLink=*/false);

  // Find nvlink and pass it as "--nvlink-path=" argument of
  // clang-nvlink-wrapper.
  CmdArgs.push_back(Args.MakeArgString(
      Twine("--nvlink-path=" + getToolChain().GetProgramPath("nvlink"))));

  const char *Exec =
      Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}

/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
/// which isn't properly a linker but nonetheless performs the step of stitching
/// together object files from the assembler into a single blob.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                             const ToolChain &HostTC, const ArgList &Args,
                             const Action::OffloadKind OK)
    : ToolChain(D, Triple, Args), HostTC(HostTC),
      CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
  if (CudaInstallation.isValid()) {
    CudaInstallation.WarnIfUnsupportedVersion();
    getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
  }
  // Look up binaries in the driver directory; this is used to
  // discover the clang-offload-bundler executable.
  getProgramPaths().push_back(getDriver().Dir);
}

std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
  // Only object files are changed; for example, assembly files keep their .s
  // extensions. CUDA also continues to use .o, since it uses fatbinary rather
  // than nvlink.
  if (!(OK == Action::OFK_OpenMP && Input.getType() == types::TY_Object))
    return ToolChain::getInputFilename(Input);

  // Replace extension for object files with cubin because nvlink relies on
  // these particular file names.
  SmallString<256> Filename(ToolChain::getInputFilename(Input));
  llvm::sys::path::replace_extension(Filename, "cubin");
  return std::string(Filename.str());
}

void CudaToolChain::addClangTargetOptions(
    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
    Action::OffloadKind DeviceOffloadingKind) const {
  HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);

  StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
  assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
  assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
          DeviceOffloadingKind == Action::OFK_Cuda) &&
         "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");

  if (DeviceOffloadingKind == Action::OFK_Cuda) {
    CC1Args.append(
        {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});

    if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                           options::OPT_fno_cuda_approx_transcendentals, false))
      CC1Args.push_back("-fcuda-approx-transcendentals");
  }

  if (DriverArgs.hasArg(options::OPT_nogpulib))
    return;

  if (DeviceOffloadingKind == Action::OFK_OpenMP &&
      DriverArgs.hasArg(options::OPT_S))
    return;

  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
  if (LibDeviceFile.empty()) {
    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
    return;
  }

  CC1Args.push_back("-mlink-builtin-bitcode");
  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();

  // New CUDA versions often introduce new instructions that are only supported
  // by the new PTX version, so we need to raise the PTX level to enable them
  // in the NVPTX back-end.
  const char *PtxFeature = nullptr;
  switch (CudaInstallationVersion) {
#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
  case CudaVersion::CUDA_##CUDA_VER:                                           \
    PtxFeature = "+ptx" #PTX_VER;                                              \
    break;
    CASE_CUDA_VERSION(115, 75);
    CASE_CUDA_VERSION(114, 74);
    CASE_CUDA_VERSION(113, 73);
    CASE_CUDA_VERSION(112, 72);
    CASE_CUDA_VERSION(111, 71);
    CASE_CUDA_VERSION(110, 70);
    CASE_CUDA_VERSION(102, 65);
    CASE_CUDA_VERSION(101, 64);
    CASE_CUDA_VERSION(100, 63);
    CASE_CUDA_VERSION(92, 61);
    CASE_CUDA_VERSION(91, 61);
    CASE_CUDA_VERSION(90, 60);
#undef CASE_CUDA_VERSION
  default:
    PtxFeature = "+ptx42";
  }
  CC1Args.append({"-target-feature", PtxFeature});
  if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                         options::OPT_fno_cuda_short_ptr, false))
    CC1Args.append({"-mllvm", "--nvptx-short-ptr"});

  if (CudaInstallationVersion >= CudaVersion::UNKNOWN)
    CC1Args.push_back(
        DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                 CudaVersionToString(CudaInstallationVersion)));

  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
    if (CudaInstallationVersion < CudaVersion::CUDA_92) {
      getDriver().Diag(
          diag::err_drv_omp_offload_target_cuda_version_not_support)
          << CudaVersionToString(CudaInstallationVersion);
      return;
    }

    // Link the bitcode library late if we're using device LTO.
    if (getDriver().isUsingLTO(/* IsOffload */ true))
      return;

    std::string BitcodeSuffix;
    if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
                           options::OPT_fno_openmp_target_new_runtime, true))
      BitcodeSuffix = "new-nvptx-" + GpuArch.str();
    else
      BitcodeSuffix = "nvptx-" + GpuArch.str();

    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                       getTriple());
    AddStaticDeviceLibsPostLinking(getDriver(), DriverArgs, CC1Args, "nvptx",
                                   GpuArch, /*isBitCodeSDL=*/true,
                                   /*postClangLink=*/true);
  }
}
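
// Pick the default denormal mode for device code: flush single-precision
// denormals to zero (preserving sign) only if -fgpu-flush-denormals-to-zero
// was given; otherwise keep IEEE denormal handling.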
llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
    const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
    const llvm::fltSemantics *FPType) const {
  if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {
    if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
        DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,
                           options::OPT_fno_gpu_flush_denormals_to_zero, false))
      return llvm::DenormalMode::getPreserveSign();
  }

  assert(JA.getOffloadingDeviceKind() != Action::OFK_Host);
  return llvm::DenormalMode::getIEEE();
}

bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
  const Option &O = A->getOption();
  return (O.matches(options::OPT_gN_Group) &&
          !O.matches(options::OPT_gmodules)) ||
         O.matches(options::OPT_g_Flag) ||
         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
         O.matches(options::OPT_gdwarf_5) ||
         O.matches(options::OPT_gcolumn_info);
}

void CudaToolChain::adjustDebugInfoKind(
    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
  switch (mustEmitDebugInfo(Args)) {
  case DisableDebugInfo:
    DebugInfoKind = codegenoptions::NoDebugInfo;
    break;
  case DebugDirectivesOnly:
    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
    break;
  case EmitSameDebugInfoAsHost:
    // Use same debug info level as the host.
    break;
  }
}

void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                       ArgStringList &CC1Args) const {
  // Check our CUDA version if we're going to include the CUDA headers.
  if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
      !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
    StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
    assert(!Arch.empty() && "Must have an explicit GPU arch.");
    CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
  }
  CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
}
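
// Build the argument list for the device-side compilation: start from the
// host's translated arguments, then bind -march to the GPU arch of this
// offload action (or the default OpenMP NVPTX arch).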
llvm::opt::DerivedArgList *
CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
                             StringRef BoundArch,
                             Action::OffloadKind DeviceOffloadKind) const {
  DerivedArgList *DAL =
      HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
  if (!DAL)
    DAL = new DerivedArgList(Args.getBaseArgs());

  const OptTable &Opts = getDriver().getOpts();

  // For OpenMP device offloading, append derived arguments. Make sure
  // flags are not duplicated.
  // Also append the compute capability.
  if (DeviceOffloadKind == Action::OFK_OpenMP) {
    for (Arg *A : Args)
      if (!llvm::is_contained(*DAL, A))
        DAL->append(A);

    StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
    if (Arch.empty())
      DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                        CLANG_OPENMP_NVPTX_DEFAULT_ARCH);

    return DAL;
  }

  for (Arg *A : Args) {
    DAL->append(A);
  }

  if (!BoundArch.empty()) {
    DAL->eraseArg(options::OPT_march_EQ);
    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                      BoundArch);
  }
  return DAL;
}

Tool *CudaToolChain::buildAssembler() const {
  return new tools::NVPTX::Assembler(*this);
}

Tool *CudaToolChain::buildLinker() const {
  if (OK == Action::OFK_OpenMP)
    return new tools::NVPTX::OpenMPLinker(*this);
  return new tools::NVPTX::Linker(*this);
}

void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
  HostTC.addClangWarningOptions(CC1Args);
}

ToolChain::CXXStdlibType
CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
  return HostTC.GetCXXStdlibType(Args);
}

void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
  HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);

  if (!DriverArgs.hasArg(options::OPT_nogpuinc) && CudaInstallation.isValid())
    CC1Args.append(
        {"-internal-isystem",
         DriverArgs.MakeArgString(CudaInstallation.getIncludePath())});
}

void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
                                                 ArgStringList &CC1Args) const {
  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
}

void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
                                        ArgStringList &CC1Args) const {
  HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
}

SanitizerMask CudaToolChain::getSupportedSanitizers() const {
  // The CudaToolChain only supports sanitizers in the sense that it allows
  // sanitizer arguments on the command line if they are supported by the host
  // toolchain. The CudaToolChain will actually ignore any command line
  // arguments for any of these "supported" sanitizers. That means that no
  // sanitization of device code is actually supported at this time.
  //
  // This behavior is necessary because the host and device toolchain
  // invocations often share the command line, so the device toolchain must
  // tolerate flags meant only for the host toolchain.
  return HostTC.getSupportedSanitizers();
}

VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D,
                                               const ArgList &Args) const {
  return HostTC.computeMSVCVersion(D, Args);
}