//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
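  // (Kernel arguments, in contrast, are uniform: every thread in the grid is
  // launched with the same argument values, whereas a __device__ function's
  // arguments may differ per thread at each call site.)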
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
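  // (The ftz state is read further down from the function's denormal-fp-math
  // attributes; for example, a function carrying
  // "denormal-fp-math-f32"="preserve-sign" has f32 ftz enabled.)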

  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    std::optional<Intrinsic::ID> IID;
    std::optional<Instruction::CastOps> CastOp;
    std::optional<Instruction::BinaryOps> BinaryOp;
    std::optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), so take note of half
    // types.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };
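  // For example (illustrative), the action {Intrinsic::ceil, FTZ_Any} rewrites
  //   %r = call double @llvm.nvvm.ceil.d(double %x)
  // into
  //   %r = call double @llvm.ceil.f64(double %x)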

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the
    // rz (round to zero) versions of the nvvm conversion intrinsics, even
    // though most everything else here uses the rn (round to nearest even)
    // nvvm ops.
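    // For example (illustrative), @llvm.nvvm.f2i.rz(float %x) becomes
    //   %r = fptosi float %x to i32
    // which likewise rounds toward zero.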
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms,
    // but need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is
    // just as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so those we could in
    // theory lower to "fast fdiv".
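    // (As IR, that encoding would look like:
    //   %r = call fast float @llvm.cos.f32(float %x)
    // with the fast-math flags attached to the call.)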
    default:
      return {};
    }
  }();

  // If Action.FtzRequirement is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirement will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    // FIXME: Broken for f64
    DenormalMode Mode = II->getFunction()->getDenormalMode(
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;
    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
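    // For example (illustrative), @llvm.nvvm.fmax.d(double %a, double %b)
    // becomes @llvm.maxnum.f64(double %a, double %b); the f64 overload is
    // selected by the type of operand 0.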
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
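    // For example (illustrative), @llvm.nvvm.rcp.rn.d(double %x) becomes
    //   %r = fdiv double 1.0, %x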
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }

  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return std::nullopt;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
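    // (For instance, a 64-bit integer add is typically emitted as a pair of
    // 32-bit add / add-with-carry instructions in SASS.)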
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}