NVPTXTargetMachine.cpp 16 KB


  1. //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Top-level implementation for the NVPTX target.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #include "NVPTXTargetMachine.h"
  13. #include "NVPTX.h"
  14. #include "NVPTXAllocaHoisting.h"
  15. #include "NVPTXAtomicLower.h"
  16. #include "NVPTXLowerAggrCopies.h"
  17. #include "NVPTXTargetObjectFile.h"
  18. #include "NVPTXTargetTransformInfo.h"
  19. #include "TargetInfo/NVPTXTargetInfo.h"
  20. #include "llvm/ADT/STLExtras.h"
  21. #include "llvm/ADT/Triple.h"
  22. #include "llvm/Analysis/TargetTransformInfo.h"
  23. #include "llvm/CodeGen/Passes.h"
  24. #include "llvm/CodeGen/TargetPassConfig.h"
  25. #include "llvm/IR/IntrinsicsNVPTX.h"
  26. #include "llvm/IR/LegacyPassManager.h"
  27. #include "llvm/MC/TargetRegistry.h"
  28. #include "llvm/Pass.h"
  29. #include "llvm/Passes/PassBuilder.h"
  30. #include "llvm/Support/CommandLine.h"
  31. #include "llvm/Target/TargetMachine.h"
  32. #include "llvm/Target/TargetOptions.h"
  33. #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  34. #include "llvm/Transforms/Scalar.h"
  35. #include "llvm/Transforms/Scalar/GVN.h"
  36. #include "llvm/Transforms/Vectorize.h"
  37. #include <cassert>
  38. #include <string>
  39. using namespace llvm;
  40. // LSV is still relatively new; this switch lets us turn it off in case we
  41. // encounter (or suspect) a bug.
  42. static cl::opt<bool>
  43. DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
  44. cl::desc("Disable load/store vectorizer"),
  45. cl::init(false), cl::Hidden);
  46. // TODO: Remove this flag when we are confident with no regressions.
  47. static cl::opt<bool> DisableRequireStructuredCFG(
  48. "disable-nvptx-require-structured-cfg",
  49. cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
  50. "structured CFG. The requirement should be disabled only when "
  51. "unexpected regressions happen."),
  52. cl::init(false), cl::Hidden);
  53. static cl::opt<bool> UseShortPointersOpt(
  54. "nvptx-short-ptr",
  55. cl::desc(
  56. "Use 32-bit pointers for accessing const/local/shared address spaces."),
  57. cl::init(false), cl::Hidden);
  58. namespace llvm {
  59. void initializeNVVMIntrRangePass(PassRegistry&);
  60. void initializeNVVMReflectPass(PassRegistry&);
  61. void initializeGenericToNVVMPass(PassRegistry&);
  62. void initializeNVPTXAllocaHoistingPass(PassRegistry &);
  63. void initializeNVPTXAtomicLowerPass(PassRegistry &);
  64. void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
  65. void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
  66. void initializeNVPTXLowerArgsPass(PassRegistry &);
  67. void initializeNVPTXLowerAllocaPass(PassRegistry &);
  68. void initializeNVPTXProxyRegErasurePass(PassRegistry &);
  69. } // end namespace llvm
  70. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  71. // Register the target.
  72. RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  73. RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
  74. // FIXME: This pass is really intended to be invoked during IR optimization,
  75. // but it's very NVPTX-specific.
  76. PassRegistry &PR = *PassRegistry::getPassRegistry();
  77. initializeNVVMReflectPass(PR);
  78. initializeNVVMIntrRangePass(PR);
  79. initializeGenericToNVVMPass(PR);
  80. initializeNVPTXAllocaHoistingPass(PR);
  81. initializeNVPTXAssignValidGlobalNamesPass(PR);
  82. initializeNVPTXAtomicLowerPass(PR);
  83. initializeNVPTXLowerArgsPass(PR);
  84. initializeNVPTXLowerAllocaPass(PR);
  85. initializeNVPTXLowerAggrCopiesPass(PR);
  86. initializeNVPTXProxyRegErasurePass(PR);
  87. }
  /// Assemble the data layout string handed to the LLVMTargetMachine base class.
  ///
  /// \param is64Bit           false adds a "-p:32:32" pointer spec; true adds no
  ///                          default pointer spec.
  /// \param UseShortPointers  only consulted when \p is64Bit is true; adds
  ///                          32-bit pointer specs for address spaces 3, 4 and 5.
  /// \returns "e", then the pointer specs selected above (if any), then the
  ///          fixed integer/vector/native-width suffix.
  static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
    std::string Layout("e");

    if (is64Bit) {
      if (UseShortPointers)
        Layout.append("-p3:32:32-p4:32:32-p5:32:32");
    } else {
      Layout.append("-p:32:32");
    }

    Layout.append("-i64:64-i128:128-v16:16-v32:32-n16:32:64");
    return Layout;
  }
  97. NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
  98. StringRef CPU, StringRef FS,
  99. const TargetOptions &Options,
  100. Optional<Reloc::Model> RM,
  101. Optional<CodeModel::Model> CM,
  102. CodeGenOpt::Level OL, bool is64bit)
  103. // The pic relocation model is used regardless of what the client has
  104. // specified, as it is the only relocation model currently supported.
  105. : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
  106. CPU, FS, Options, Reloc::PIC_,
  107. getEffectiveCodeModel(CM, CodeModel::Small), OL),
  108. is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
  109. TLOF(std::make_unique<NVPTXTargetObjectFile>()),
  110. Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  111. if (TT.getOS() == Triple::NVCL)
  112. drvInterface = NVPTX::NVCL;
  113. else
  114. drvInterface = NVPTX::CUDA;
  115. if (!DisableRequireStructuredCFG)
  116. setRequiresStructuredCFG(true);
  117. initAsmInfo();
  118. }
  119. NVPTXTargetMachine::~NVPTXTargetMachine() = default;
  120. void NVPTXTargetMachine32::anchor() {}
  121. NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
  122. StringRef CPU, StringRef FS,
  123. const TargetOptions &Options,
  124. Optional<Reloc::Model> RM,
  125. Optional<CodeModel::Model> CM,
  126. CodeGenOpt::Level OL, bool JIT)
  127. : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
  128. void NVPTXTargetMachine64::anchor() {}
  129. NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
  130. StringRef CPU, StringRef FS,
  131. const TargetOptions &Options,
  132. Optional<Reloc::Model> RM,
  133. Optional<CodeModel::Model> CM,
  134. CodeGenOpt::Level OL, bool JIT)
  135. : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
  136. namespace {
  137. class NVPTXPassConfig : public TargetPassConfig {
  138. public:
  139. NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
  140. : TargetPassConfig(TM, PM) {}
  141. NVPTXTargetMachine &getNVPTXTargetMachine() const {
  142. return getTM<NVPTXTargetMachine>();
  143. }
  144. void addIRPasses() override;
  145. bool addInstSelector() override;
  146. void addPreRegAlloc() override;
  147. void addPostRegAlloc() override;
  148. void addMachineSSAOptimization() override;
  149. FunctionPass *createTargetRegisterAllocator(bool) override;
  150. void addFastRegAlloc() override;
  151. void addOptimizedRegAlloc() override;
  152. bool addRegAssignAndRewriteFast() override {
  153. llvm_unreachable("should not be used");
  154. }
  155. bool addRegAssignAndRewriteOptimized() override {
  156. llvm_unreachable("should not be used");
  157. }
  158. private:
  159. // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  160. // function is only called in opt mode.
  161. void addEarlyCSEOrGVNPass();
  162. // Add passes that propagate special memory spaces.
  163. void addAddressSpaceInferencePasses();
  164. // Add passes that perform straight-line scalar optimizations.
  165. void addStraightLineScalarOptimizationPasses();
  166. };
  167. } // end anonymous namespace
  168. TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  169. return new NVPTXPassConfig(*this, PM);
  170. }
  171. void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  172. Builder.addExtension(
  173. PassManagerBuilder::EP_EarlyAsPossible,
  174. [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
  175. PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
  176. PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
  177. });
  178. }
  179. void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  180. PB.registerPipelineParsingCallback(
  181. [](StringRef PassName, FunctionPassManager &PM,
  182. ArrayRef<PassBuilder::PipelineElement>) {
  183. if (PassName == "nvvm-reflect") {
  184. PM.addPass(NVVMReflectPass());
  185. return true;
  186. }
  187. if (PassName == "nvvm-intr-range") {
  188. PM.addPass(NVVMIntrRangePass());
  189. return true;
  190. }
  191. return false;
  192. });
  193. PB.registerPipelineStartEPCallback(
  194. [this](ModulePassManager &PM, OptimizationLevel Level) {
  195. FunctionPassManager FPM;
  196. FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
  197. // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
  198. // investigate and re-enable.
  199. // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
  200. PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  201. });
  202. }
  203. TargetTransformInfo
  204. NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  205. return TargetTransformInfo(NVPTXTTIImpl(this, F));
  206. }
  207. std::pair<const Value *, unsigned>
  208. NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  209. if (auto *II = dyn_cast<IntrinsicInst>(V)) {
  210. switch (II->getIntrinsicID()) {
  211. case Intrinsic::nvvm_isspacep_const:
  212. return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
  213. case Intrinsic::nvvm_isspacep_global:
  214. return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
  215. case Intrinsic::nvvm_isspacep_local:
  216. return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
  217. case Intrinsic::nvvm_isspacep_shared:
  218. return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
  219. default:
  220. break;
  221. }
  222. }
  223. return std::make_pair(nullptr, -1);
  224. }
  225. void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  226. if (getOptLevel() == CodeGenOpt::Aggressive)
  227. addPass(createGVNPass());
  228. else
  229. addPass(createEarlyCSEPass());
  230. }
  231. void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  232. // NVPTXLowerArgs emits alloca for byval parameters which can often
  233. // be eliminated by SROA.
  234. addPass(createSROAPass());
  235. addPass(createNVPTXLowerAllocaPass());
  236. addPass(createInferAddressSpacesPass());
  237. addPass(createNVPTXAtomicLowerPass());
  238. }
  239. void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  240. addPass(createSeparateConstOffsetFromGEPPass());
  241. addPass(createSpeculativeExecutionPass());
  242. // ReassociateGEPs exposes more opportunites for SLSR. See
  243. // the example in reassociate-geps-and-slsr.ll.
  244. addPass(createStraightLineStrengthReducePass());
  245. // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  246. // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  247. // for some of our benchmarks.
  248. addEarlyCSEOrGVNPass();
  249. // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  250. addPass(createNaryReassociatePass());
  251. // NaryReassociate on GEPs creates redundant common expressions, so run
  252. // EarlyCSE after it.
  253. addPass(createEarlyCSEPass());
  254. }
  255. void NVPTXPassConfig::addIRPasses() {
  256. // The following passes are known to not play well with virtual regs hanging
  257. // around after register allocation (which in our case, is *all* registers).
  258. // We explicitly disable them here. We do, however, need some functionality
  259. // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  260. // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  261. disablePass(&PrologEpilogCodeInserterID);
  262. disablePass(&MachineCopyPropagationID);
  263. disablePass(&TailDuplicateID);
  264. disablePass(&StackMapLivenessID);
  265. disablePass(&LiveDebugValuesID);
  266. disablePass(&PostRAMachineSinkingID);
  267. disablePass(&PostRASchedulerID);
  268. disablePass(&FuncletLayoutID);
  269. disablePass(&PatchableFunctionID);
  270. disablePass(&ShrinkWrapID);
  271. // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  272. // it here does nothing. But since we need it for correctness when lowering
  273. // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  274. // call addEarlyAsPossiblePasses.
  275. const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  276. addPass(createNVVMReflectPass(ST.getSmVersion()));
  277. if (getOptLevel() != CodeGenOpt::None)
  278. addPass(createNVPTXImageOptimizerPass());
  279. addPass(createNVPTXAssignValidGlobalNamesPass());
  280. addPass(createGenericToNVVMPass());
  281. // NVPTXLowerArgs is required for correctness and should be run right
  282. // before the address space inference passes.
  283. addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  284. if (getOptLevel() != CodeGenOpt::None) {
  285. addAddressSpaceInferencePasses();
  286. addStraightLineScalarOptimizationPasses();
  287. }
  288. // === LSR and other generic IR passes ===
  289. TargetPassConfig::addIRPasses();
  290. // EarlyCSE is not always strong enough to clean up what LSR produces. For
  291. // example, GVN can combine
  292. //
  293. // %0 = add %a, %b
  294. // %1 = add %b, %a
  295. //
  296. // and
  297. //
  298. // %0 = shl nsw %a, 2
  299. // %1 = shl %a, 2
  300. //
  301. // but EarlyCSE can do neither of them.
  302. if (getOptLevel() != CodeGenOpt::None) {
  303. addEarlyCSEOrGVNPass();
  304. if (!DisableLoadStoreVectorizer)
  305. addPass(createLoadStoreVectorizerPass());
  306. addPass(createSROAPass());
  307. }
  308. }
  309. bool NVPTXPassConfig::addInstSelector() {
  310. const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  311. addPass(createLowerAggrCopies());
  312. addPass(createAllocaHoisting());
  313. addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
  314. if (!ST.hasImageHandles())
  315. addPass(createNVPTXReplaceImageHandlesPass());
  316. return false;
  317. }
  318. void NVPTXPassConfig::addPreRegAlloc() {
  319. // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  320. addPass(createNVPTXProxyRegErasurePass());
  321. }
  322. void NVPTXPassConfig::addPostRegAlloc() {
  323. addPass(createNVPTXPrologEpilogPass());
  324. if (getOptLevel() != CodeGenOpt::None) {
  325. // NVPTXPrologEpilogPass calculates frame object offset and replace frame
  326. // index with VRFrame register. NVPTXPeephole need to be run after that and
  327. // will replace VRFrame with VRFrameLocal when possible.
  328. addPass(createNVPTXPeephole());
  329. }
  330. }
  331. FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  332. return nullptr; // No reg alloc
  333. }
  334. void NVPTXPassConfig::addFastRegAlloc() {
  335. addPass(&PHIEliminationID);
  336. addPass(&TwoAddressInstructionPassID);
  337. }
  338. void NVPTXPassConfig::addOptimizedRegAlloc() {
  339. addPass(&ProcessImplicitDefsID);
  340. addPass(&LiveVariablesID);
  341. addPass(&MachineLoopInfoID);
  342. addPass(&PHIEliminationID);
  343. addPass(&TwoAddressInstructionPassID);
  344. addPass(&RegisterCoalescerID);
  345. // PreRA instruction scheduling.
  346. if (addPass(&MachineSchedulerID))
  347. printAndVerify("After Machine Scheduling");
  348. addPass(&StackSlotColoringID);
  349. // FIXME: Needs physical registers
  350. //addPass(&MachineLICMID);
  351. printAndVerify("After StackSlotColoring");
  352. }
  353. void NVPTXPassConfig::addMachineSSAOptimization() {
  354. // Pre-ra tail duplication.
  355. if (addPass(&EarlyTailDuplicateID))
  356. printAndVerify("After Pre-RegAlloc TailDuplicate");
  357. // Optimize PHIs before DCE: removing dead PHI cycles may make more
  358. // instructions dead.
  359. addPass(&OptimizePHIsID);
  360. // This pass merges large allocas. StackSlotColoring is a different pass
  361. // which merges spill slots.
  362. addPass(&StackColoringID);
  363. // If the target requests it, assign local variables to stack slots relative
  364. // to one another and simplify frame index references where possible.
  365. addPass(&LocalStackSlotAllocationID);
  366. // With optimization, dead code should already be eliminated. However
  367. // there is one known exception: lowered code for arguments that are only
  368. // used by tail calls, where the tail calls reuse the incoming stack
  369. // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  370. addPass(&DeadMachineInstructionElimID);
  371. printAndVerify("After codegen DCE pass");
  372. // Allow targets to insert passes that improve instruction level parallelism,
  373. // like if-conversion. Such passes will typically need dominator trees and
  374. // loop info, just like LICM and CSE below.
  375. if (addILPOpts())
  376. printAndVerify("After ILP optimizations");
  377. addPass(&EarlyMachineLICMID);
  378. addPass(&MachineCSEID);
  379. addPass(&MachineSinkingID);
  380. printAndVerify("After Machine LICM, CSE and Sinking passes");
  381. addPass(&PeepholeOptimizerID);
  382. printAndVerify("After codegen peephole optimization pass");
  383. }