  1. //===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "PPCTargetTransformInfo.h"
  9. #include "llvm/Analysis/CodeMetrics.h"
  10. #include "llvm/Analysis/TargetLibraryInfo.h"
  11. #include "llvm/Analysis/TargetTransformInfo.h"
  12. #include "llvm/CodeGen/BasicTTIImpl.h"
  13. #include "llvm/CodeGen/CostTable.h"
  14. #include "llvm/CodeGen/TargetLowering.h"
  15. #include "llvm/CodeGen/TargetSchedule.h"
  16. #include "llvm/IR/IntrinsicsPowerPC.h"
  17. #include "llvm/Support/CommandLine.h"
  18. #include "llvm/Support/Debug.h"
  19. #include "llvm/Support/KnownBits.h"
  20. #include "llvm/Transforms/InstCombine/InstCombiner.h"
  21. #include "llvm/Transforms/Utils/Local.h"
  22. using namespace llvm;
  23. #define DEBUG_TYPE "ppctti"
  24. static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
  25. cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
  26. // This is currently only used for the data prefetch pass
  27. static cl::opt<unsigned>
  28. CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
  29. cl::desc("The loop prefetch cache line size"));
  30. static cl::opt<bool>
  31. EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
  32. cl::desc("Enable using coldcc calling conv for cold "
  33. "internal functions"));
  34. static cl::opt<bool>
  35. LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
  36. cl::desc("Do not add instruction count to lsr cost model"));
  37. // The latency of mtctr is only justified if there are more than 4
  38. // comparisons that will be removed as a result.
  39. static cl::opt<unsigned>
  40. SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
  41. cl::desc("Loops with a constant trip count smaller than "
  42. "this value will not use the count register."));
  43. //===----------------------------------------------------------------------===//
  44. //
  45. // PPC cost model.
  46. //
  47. //===----------------------------------------------------------------------===//
  48. TargetTransformInfo::PopcntSupportKind
  49. PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  50. assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  51. if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
  52. return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
  53. TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  54. return TTI::PSK_Software;
  55. }
  56. Optional<Instruction *>
  57. PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  58. Intrinsic::ID IID = II.getIntrinsicID();
  59. switch (IID) {
  60. default:
  61. break;
  62. case Intrinsic::ppc_altivec_lvx:
  63. case Intrinsic::ppc_altivec_lvxl:
  64. // Turn PPC lvx -> load if the pointer is known aligned.
  65. if (getOrEnforceKnownAlignment(
  66. II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
  67. &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
  68. Value *Ptr = IC.Builder.CreateBitCast(
  69. II.getArgOperand(0), PointerType::getUnqual(II.getType()));
  70. return new LoadInst(II.getType(), Ptr, "", false, Align(16));
  71. }
  72. break;
  73. case Intrinsic::ppc_vsx_lxvw4x:
  74. case Intrinsic::ppc_vsx_lxvd2x: {
  75. // Turn PPC VSX loads into normal loads.
  76. Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0),
  77. PointerType::getUnqual(II.getType()));
  78. return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  79. }
  80. case Intrinsic::ppc_altivec_stvx:
  81. case Intrinsic::ppc_altivec_stvxl:
  82. // Turn stvx -> store if the pointer is known aligned.
  83. if (getOrEnforceKnownAlignment(
  84. II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
  85. &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
  86. Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
  87. Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
  88. return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
  89. }
  90. break;
  91. case Intrinsic::ppc_vsx_stxvw4x:
  92. case Intrinsic::ppc_vsx_stxvd2x: {
  93. // Turn PPC VSX stores into normal stores.
  94. Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
  95. Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
  96. return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  97. }
  98. case Intrinsic::ppc_altivec_vperm:
  99. // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
  100. // Note that ppc_altivec_vperm has a big-endian bias, so when creating
  101. // a vectorshuffle for little endian, we must undo the transformation
  102. // performed on vec_perm in altivec.h. That is, we must complement
  103. // the permutation mask with respect to 31 and reverse the order of
  104. // V1 and V2.
  105. if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
  106. assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
  107. "Bad type for intrinsic!");
  108. // Check that all of the elements are integer constants or undefs.
  109. bool AllEltsOk = true;
  110. for (unsigned i = 0; i != 16; ++i) {
  111. Constant *Elt = Mask->getAggregateElement(i);
  112. if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
  113. AllEltsOk = false;
  114. break;
  115. }
  116. }
  117. if (AllEltsOk) {
  118. // Cast the input vectors to byte vectors.
  119. Value *Op0 =
  120. IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
  121. Value *Op1 =
  122. IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
  123. Value *Result = UndefValue::get(Op0->getType());
  124. // Only extract each element once.
  125. Value *ExtractedElts[32];
  126. memset(ExtractedElts, 0, sizeof(ExtractedElts));
  127. for (unsigned i = 0; i != 16; ++i) {
  128. if (isa<UndefValue>(Mask->getAggregateElement(i)))
  129. continue;
  130. unsigned Idx =
  131. cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
  132. Idx &= 31; // Match the hardware behavior.
  133. if (DL.isLittleEndian())
  134. Idx = 31 - Idx;
  135. if (!ExtractedElts[Idx]) {
  136. Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
  137. Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
  138. ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
  139. Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
  140. }
  141. // Insert this value into the result vector.
  142. Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
  143. IC.Builder.getInt32(i));
  144. }
  145. return CastInst::Create(Instruction::BitCast, Result, II.getType());
  146. }
  147. }
  148. break;
  149. }
  150. return None;
  151. }
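// Cost of materializing the integer immediate Imm: constants that fit in a
// signed 16-bit field take one instruction, 32-bit constants whose low 16 bits
// are zero can be built with a single lis, other 32-bit constants take two
// instructions, and wider constants are assumed to need a longer sequence.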
  152. InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
  153. TTI::TargetCostKind CostKind) {
  154. if (DisablePPCConstHoist)
  155. return BaseT::getIntImmCost(Imm, Ty, CostKind);
  156. assert(Ty->isIntegerTy());
  157. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  158. if (BitSize == 0)
  159. return ~0U;
  160. if (Imm == 0)
  161. return TTI::TCC_Free;
  162. if (Imm.getBitWidth() <= 64) {
  163. if (isInt<16>(Imm.getSExtValue()))
  164. return TTI::TCC_Basic;
  165. if (isInt<32>(Imm.getSExtValue())) {
  166. // A constant that can be materialized using lis.
  167. if ((Imm.getZExtValue() & 0xFFFF) == 0)
  168. return TTI::TCC_Basic;
  169. return 2 * TTI::TCC_Basic;
  170. }
  171. }
  172. return 4 * TTI::TCC_Basic;
  173. }
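// Immediates used by certain intrinsics are free: 16-bit immediates for the
// add/sub-with-overflow intrinsics, and the leading metadata operands (or any
// immediate fitting in 64 bits) of stackmap and patchpoint. Everything else
// falls back to the generic immediate cost above.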
  174. InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
  175. const APInt &Imm, Type *Ty,
  176. TTI::TargetCostKind CostKind) {
  177. if (DisablePPCConstHoist)
  178. return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
  179. assert(Ty->isIntegerTy());
  180. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  181. if (BitSize == 0)
  182. return ~0U;
  183. switch (IID) {
  184. default:
  185. return TTI::TCC_Free;
  186. case Intrinsic::sadd_with_overflow:
  187. case Intrinsic::uadd_with_overflow:
  188. case Intrinsic::ssub_with_overflow:
  189. case Intrinsic::usub_with_overflow:
  190. if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
  191. return TTI::TCC_Free;
  192. break;
  193. case Intrinsic::experimental_stackmap:
  194. if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  195. return TTI::TCC_Free;
  196. break;
  197. case Intrinsic::experimental_patchpoint_void:
  198. case Intrinsic::experimental_patchpoint_i64:
  199. if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  200. return TTI::TCC_Free;
  201. break;
  202. }
  203. return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
  204. }
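// Per-instruction immediate cost: most D-form arithmetic and logical
// instructions encode a signed or shifted 16-bit immediate directly, compares
// against zero can use record-form instructions, and 'and' with a (possibly
// inverted) shifted mask maps onto the rotate-and-mask instructions, so such
// immediates are free.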
  205. InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  206. const APInt &Imm, Type *Ty,
  207. TTI::TargetCostKind CostKind,
  208. Instruction *Inst) {
  209. if (DisablePPCConstHoist)
  210. return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);
  211. assert(Ty->isIntegerTy());
  212. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  213. if (BitSize == 0)
  214. return ~0U;
  215. unsigned ImmIdx = ~0U;
  216. bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
  217. ZeroFree = false;
  218. switch (Opcode) {
  219. default:
  220. return TTI::TCC_Free;
  221. case Instruction::GetElementPtr:
  222. // Always hoist the base address of a GetElementPtr. This prevents the
  223. // creation of new constants for every base constant that gets constant
  224. // folded with the offset.
  225. if (Idx == 0)
  226. return 2 * TTI::TCC_Basic;
  227. return TTI::TCC_Free;
  228. case Instruction::And:
  229. RunFree = true; // (for the rotate-and-mask instructions)
  230. LLVM_FALLTHROUGH;
  231. case Instruction::Add:
  232. case Instruction::Or:
  233. case Instruction::Xor:
  234. ShiftedFree = true;
  235. LLVM_FALLTHROUGH;
  236. case Instruction::Sub:
  237. case Instruction::Mul:
  238. case Instruction::Shl:
  239. case Instruction::LShr:
  240. case Instruction::AShr:
  241. ImmIdx = 1;
  242. break;
  243. case Instruction::ICmp:
  244. UnsignedFree = true;
  245. ImmIdx = 1;
  246. // Zero comparisons can use record-form instructions.
  247. LLVM_FALLTHROUGH;
  248. case Instruction::Select:
  249. ZeroFree = true;
  250. break;
  251. case Instruction::PHI:
  252. case Instruction::Call:
  253. case Instruction::Ret:
  254. case Instruction::Load:
  255. case Instruction::Store:
  256. break;
  257. }
  258. if (ZeroFree && Imm == 0)
  259. return TTI::TCC_Free;
  260. if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
  261. if (isInt<16>(Imm.getSExtValue()))
  262. return TTI::TCC_Free;
  263. if (RunFree) {
  264. if (Imm.getBitWidth() <= 32 &&
  265. (isShiftedMask_32(Imm.getZExtValue()) ||
  266. isShiftedMask_32(~Imm.getZExtValue())))
  267. return TTI::TCC_Free;
  268. if (ST->isPPC64() &&
  269. (isShiftedMask_64(Imm.getZExtValue()) ||
  270. isShiftedMask_64(~Imm.getZExtValue())))
  271. return TTI::TCC_Free;
  272. }
  273. if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
  274. return TTI::TCC_Free;
  275. if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
  276. return TTI::TCC_Free;
  277. }
  278. return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
  279. }
  280. // Check if the current Type is an MMA vector type. The valid MMA types are
  281. // v256i1 and v512i1.
  282. static bool isMMAType(Type *Ty) {
  283. return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
  284. (Ty->getPrimitiveSizeInBits() > 128);
  285. }
  286. InstructionCost PPCTTIImpl::getUserCost(const User *U,
  287. ArrayRef<const Value *> Operands,
  288. TTI::TargetCostKind CostKind) {
  289. // We already implement getCastInstrCost and getMemoryOpCost where we perform
  290. // the vector adjustment there.
  291. if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
  292. return BaseT::getUserCost(U, Operands, CostKind);
  293. if (U->getType()->isVectorTy()) {
  294. // Instructions that need to be split should cost more.
  295. std::pair<InstructionCost, MVT> LT =
  296. TLI->getTypeLegalizationCost(DL, U->getType());
  297. return LT.first * BaseT::getUserCost(U, Operands, CostKind);
  298. }
  299. return BaseT::getUserCost(U, Operands, CostKind);
  300. }
  301. // Determining the address of a TLS variable results in a function call in
  302. // certain TLS models.
  303. static bool memAddrUsesCTR(const Value *MemAddr, const PPCTargetMachine &TM,
  304. SmallPtrSetImpl<const Value *> &Visited) {
  305. // No need to traverse again if we already checked this operand.
  306. if (!Visited.insert(MemAddr).second)
  307. return false;
  308. const auto *GV = dyn_cast<GlobalValue>(MemAddr);
  309. if (!GV) {
  310. // Recurse to check for constants that refer to TLS global variables.
  311. if (const auto *CV = dyn_cast<Constant>(MemAddr))
  312. for (const auto &CO : CV->operands())
  313. if (memAddrUsesCTR(CO, TM, Visited))
  314. return true;
  315. return false;
  316. }
  317. if (!GV->isThreadLocal())
  318. return false;
  319. TLSModel::Model Model = TM.getTLSModel(GV);
  320. return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
  321. }
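// Conservatively returns true if any instruction in BB may be lowered to a
// call or may otherwise clobber the CTR register, which would make a CTR-based
// hardware loop illegal or unprofitable.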
  322. bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
  323. SmallPtrSetImpl<const Value *> &Visited) {
  324. const PPCTargetMachine &TM = ST->getTargetMachine();
  325. // Loop through the inline asm constraints and look for something that
  326. // clobbers ctr.
  327. auto asmClobbersCTR = [](InlineAsm *IA) {
  328. InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
  329. for (const InlineAsm::ConstraintInfo &C : CIV) {
  330. if (C.Type != InlineAsm::isInput)
  331. for (const auto &Code : C.Codes)
  332. if (StringRef(Code).equals_insensitive("{ctr}"))
  333. return true;
  334. }
  335. return false;
  336. };
  337. auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
  338. if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
  339. return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
  340. return false;
  341. };
  342. auto supportedHalfPrecisionOp = [](Instruction *Inst) {
  343. switch (Inst->getOpcode()) {
  344. default:
  345. return false;
  346. case Instruction::FPTrunc:
  347. case Instruction::FPExt:
  348. case Instruction::Load:
  349. case Instruction::Store:
  350. case Instruction::FPToUI:
  351. case Instruction::UIToFP:
  352. case Instruction::FPToSI:
  353. case Instruction::SIToFP:
  354. return true;
  355. }
  356. };
  357. for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
  358. J != JE; ++J) {
  359. // There are no direct operations on half precision, so assume that
  360. // anything with that type requires a call, except for a few select
  361. // operations on Power9.
  362. if (Instruction *CurrInst = dyn_cast<Instruction>(J)) {
  363. for (const auto &Op : CurrInst->operands()) {
  364. if (Op->getType()->getScalarType()->isHalfTy() ||
  365. CurrInst->getType()->getScalarType()->isHalfTy())
  366. return !(ST->isISA3_0() && supportedHalfPrecisionOp(CurrInst));
  367. }
  368. }
  369. if (CallInst *CI = dyn_cast<CallInst>(J)) {
  370. // Inline ASM is okay, unless it clobbers the ctr register.
  371. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
  372. if (asmClobbersCTR(IA))
  373. return true;
  374. continue;
  375. }
  376. if (Function *F = CI->getCalledFunction()) {
  377. // Most intrinsics don't become function calls, but some might.
  378. // sin, cos, exp and log are always calls.
  379. unsigned Opcode = 0;
  380. if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
  381. switch (F->getIntrinsicID()) {
  382. default: continue;
  383. // If we have a call to loop_decrement or set_loop_iterations,
  384. // we're definitely using CTR.
  385. case Intrinsic::set_loop_iterations:
  386. case Intrinsic::loop_decrement:
  387. return true;
  388. // Binary operations on 128-bit value will use CTR.
  389. case Intrinsic::experimental_constrained_fadd:
  390. case Intrinsic::experimental_constrained_fsub:
  391. case Intrinsic::experimental_constrained_fmul:
  392. case Intrinsic::experimental_constrained_fdiv:
  393. case Intrinsic::experimental_constrained_frem:
  394. if (F->getType()->getScalarType()->isFP128Ty() ||
  395. F->getType()->getScalarType()->isPPC_FP128Ty())
  396. return true;
  397. break;
  398. case Intrinsic::experimental_constrained_fptosi:
  399. case Intrinsic::experimental_constrained_fptoui:
  400. case Intrinsic::experimental_constrained_sitofp:
  401. case Intrinsic::experimental_constrained_uitofp: {
  402. Type *SrcType = CI->getArgOperand(0)->getType()->getScalarType();
  403. Type *DstType = CI->getType()->getScalarType();
  404. if (SrcType->isPPC_FP128Ty() || DstType->isPPC_FP128Ty() ||
  405. isLargeIntegerTy(!TM.isPPC64(), SrcType) ||
  406. isLargeIntegerTy(!TM.isPPC64(), DstType))
  407. return true;
  408. break;
  409. }
  410. // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
  411. // because, although it does clobber the counter register, the
  412. // control can't then return to inside the loop unless there is also
  413. // an eh_sjlj_setjmp.
  414. case Intrinsic::eh_sjlj_setjmp:
  415. case Intrinsic::memcpy:
  416. case Intrinsic::memmove:
  417. case Intrinsic::memset:
  418. case Intrinsic::powi:
  419. case Intrinsic::log:
  420. case Intrinsic::log2:
  421. case Intrinsic::log10:
  422. case Intrinsic::exp:
  423. case Intrinsic::exp2:
  424. case Intrinsic::pow:
  425. case Intrinsic::sin:
  426. case Intrinsic::cos:
  427. case Intrinsic::experimental_constrained_powi:
  428. case Intrinsic::experimental_constrained_log:
  429. case Intrinsic::experimental_constrained_log2:
  430. case Intrinsic::experimental_constrained_log10:
  431. case Intrinsic::experimental_constrained_exp:
  432. case Intrinsic::experimental_constrained_exp2:
  433. case Intrinsic::experimental_constrained_pow:
  434. case Intrinsic::experimental_constrained_sin:
  435. case Intrinsic::experimental_constrained_cos:
  436. return true;
  437. case Intrinsic::copysign:
  438. if (CI->getArgOperand(0)->getType()->getScalarType()->
  439. isPPC_FP128Ty())
  440. return true;
  441. else
  442. continue; // ISD::FCOPYSIGN is never a library call.
  443. case Intrinsic::fmuladd:
  444. case Intrinsic::fma: Opcode = ISD::FMA; break;
  445. case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
  446. case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
  447. case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
  448. case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
  449. case Intrinsic::rint: Opcode = ISD::FRINT; break;
  450. case Intrinsic::lrint: Opcode = ISD::LRINT; break;
  451. case Intrinsic::llrint: Opcode = ISD::LLRINT; break;
  452. case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
  453. case Intrinsic::round: Opcode = ISD::FROUND; break;
  454. case Intrinsic::lround: Opcode = ISD::LROUND; break;
  455. case Intrinsic::llround: Opcode = ISD::LLROUND; break;
  456. case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
  457. case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
  458. case Intrinsic::experimental_constrained_fcmp:
  459. Opcode = ISD::STRICT_FSETCC;
  460. break;
  461. case Intrinsic::experimental_constrained_fcmps:
  462. Opcode = ISD::STRICT_FSETCCS;
  463. break;
  464. case Intrinsic::experimental_constrained_fma:
  465. Opcode = ISD::STRICT_FMA;
  466. break;
  467. case Intrinsic::experimental_constrained_sqrt:
  468. Opcode = ISD::STRICT_FSQRT;
  469. break;
  470. case Intrinsic::experimental_constrained_floor:
  471. Opcode = ISD::STRICT_FFLOOR;
  472. break;
  473. case Intrinsic::experimental_constrained_ceil:
  474. Opcode = ISD::STRICT_FCEIL;
  475. break;
  476. case Intrinsic::experimental_constrained_trunc:
  477. Opcode = ISD::STRICT_FTRUNC;
  478. break;
  479. case Intrinsic::experimental_constrained_rint:
  480. Opcode = ISD::STRICT_FRINT;
  481. break;
  482. case Intrinsic::experimental_constrained_lrint:
  483. Opcode = ISD::STRICT_LRINT;
  484. break;
  485. case Intrinsic::experimental_constrained_llrint:
  486. Opcode = ISD::STRICT_LLRINT;
  487. break;
  488. case Intrinsic::experimental_constrained_nearbyint:
  489. Opcode = ISD::STRICT_FNEARBYINT;
  490. break;
  491. case Intrinsic::experimental_constrained_round:
  492. Opcode = ISD::STRICT_FROUND;
  493. break;
  494. case Intrinsic::experimental_constrained_lround:
  495. Opcode = ISD::STRICT_LROUND;
  496. break;
  497. case Intrinsic::experimental_constrained_llround:
  498. Opcode = ISD::STRICT_LLROUND;
  499. break;
  500. case Intrinsic::experimental_constrained_minnum:
  501. Opcode = ISD::STRICT_FMINNUM;
  502. break;
  503. case Intrinsic::experimental_constrained_maxnum:
  504. Opcode = ISD::STRICT_FMAXNUM;
  505. break;
  506. case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
  507. case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
  508. }
  509. }
  510. // PowerPC does not use [US]DIVREM or other library calls for
  511. // operations on regular types which are not otherwise library calls
  512. // (i.e. soft float or atomics). If adapting for targets that do,
  513. // additional care is required here.
  514. LibFunc Func;
  515. if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
  516. LibInfo->getLibFunc(F->getName(), Func) &&
  517. LibInfo->hasOptimizedCodeGen(Func)) {
  518. // Non-read-only functions are never treated as intrinsics.
  519. if (!CI->onlyReadsMemory())
  520. return true;
  521. // Conversion happens only for FP calls.
  522. if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
  523. return true;
  524. switch (Func) {
  525. default: return true;
  526. case LibFunc_copysign:
  527. case LibFunc_copysignf:
  528. continue; // ISD::FCOPYSIGN is never a library call.
  529. case LibFunc_copysignl:
  530. return true;
  531. case LibFunc_fabs:
  532. case LibFunc_fabsf:
  533. case LibFunc_fabsl:
  534. continue; // ISD::FABS is never a library call.
  535. case LibFunc_sqrt:
  536. case LibFunc_sqrtf:
  537. case LibFunc_sqrtl:
  538. Opcode = ISD::FSQRT; break;
  539. case LibFunc_floor:
  540. case LibFunc_floorf:
  541. case LibFunc_floorl:
  542. Opcode = ISD::FFLOOR; break;
  543. case LibFunc_nearbyint:
  544. case LibFunc_nearbyintf:
  545. case LibFunc_nearbyintl:
  546. Opcode = ISD::FNEARBYINT; break;
  547. case LibFunc_ceil:
  548. case LibFunc_ceilf:
  549. case LibFunc_ceill:
  550. Opcode = ISD::FCEIL; break;
  551. case LibFunc_rint:
  552. case LibFunc_rintf:
  553. case LibFunc_rintl:
  554. Opcode = ISD::FRINT; break;
  555. case LibFunc_round:
  556. case LibFunc_roundf:
  557. case LibFunc_roundl:
  558. Opcode = ISD::FROUND; break;
  559. case LibFunc_trunc:
  560. case LibFunc_truncf:
  561. case LibFunc_truncl:
  562. Opcode = ISD::FTRUNC; break;
  563. case LibFunc_fmin:
  564. case LibFunc_fminf:
  565. case LibFunc_fminl:
  566. Opcode = ISD::FMINNUM; break;
  567. case LibFunc_fmax:
  568. case LibFunc_fmaxf:
  569. case LibFunc_fmaxl:
  570. Opcode = ISD::FMAXNUM; break;
  571. }
  572. }
  573. if (Opcode) {
  574. EVT EVTy =
  575. TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
  576. if (EVTy == MVT::Other)
  577. return true;
  578. if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
  579. continue;
  580. else if (EVTy.isVector() &&
  581. TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
  582. continue;
  583. return true;
  584. }
  585. }
  586. return true;
  587. } else if ((J->getType()->getScalarType()->isFP128Ty() ||
  588. J->getType()->getScalarType()->isPPC_FP128Ty())) {
  589. // Most operations on f128 or ppc_f128 values become calls.
  590. return true;
  591. } else if (isa<FCmpInst>(J) &&
  592. J->getOperand(0)->getType()->getScalarType()->isFP128Ty()) {
  593. return true;
  594. } else if ((isa<FPTruncInst>(J) || isa<FPExtInst>(J)) &&
  595. (cast<CastInst>(J)->getSrcTy()->getScalarType()->isFP128Ty() ||
  596. cast<CastInst>(J)->getDestTy()->getScalarType()->isFP128Ty())) {
  597. return true;
  598. } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
  599. isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
  600. CastInst *CI = cast<CastInst>(J);
  601. if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
  602. CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
  603. isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
  604. isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
  605. return true;
  606. } else if (isLargeIntegerTy(!TM.isPPC64(),
  607. J->getType()->getScalarType()) &&
  608. (J->getOpcode() == Instruction::UDiv ||
  609. J->getOpcode() == Instruction::SDiv ||
  610. J->getOpcode() == Instruction::URem ||
  611. J->getOpcode() == Instruction::SRem)) {
  612. return true;
  613. } else if (!TM.isPPC64() &&
  614. isLargeIntegerTy(false, J->getType()->getScalarType()) &&
  615. (J->getOpcode() == Instruction::Shl ||
  616. J->getOpcode() == Instruction::AShr ||
  617. J->getOpcode() == Instruction::LShr)) {
  618. // Only on PPC32, for 128-bit integers (specifically not 64-bit
  619. // integers), these might be runtime calls.
  620. return true;
  621. } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
  622. // On PowerPC, indirect jumps use the counter register.
  623. return true;
  624. } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
  625. if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
  626. return true;
  627. }
  628. // FREM is always a call.
  629. if (J->getOpcode() == Instruction::FRem)
  630. return true;
  631. if (ST->useSoftFloat()) {
  632. switch(J->getOpcode()) {
  633. case Instruction::FAdd:
  634. case Instruction::FSub:
  635. case Instruction::FMul:
  636. case Instruction::FDiv:
  637. case Instruction::FPTrunc:
  638. case Instruction::FPExt:
  639. case Instruction::FPToUI:
  640. case Instruction::FPToSI:
  641. case Instruction::UIToFP:
  642. case Instruction::SIToFP:
  643. case Instruction::FCmp:
  644. return true;
  645. }
  646. }
  647. for (Value *Operand : J->operands())
  648. if (memAddrUsesCTR(Operand, TM, Visited))
  649. return true;
  650. }
  651. return false;
  652. }
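// Decide whether this loop should be converted to a CTR (mtctr/bdnz) hardware
// loop: reject small constant-trip-count loops, loops that may clobber CTR,
// loops whose exit edges are more frequently taken than the backedge, and
// loops whose exit PHIs would require computing a TLS variable's address (a
// call) inside the loop.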
  653. bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
  654. AssumptionCache &AC,
  655. TargetLibraryInfo *LibInfo,
  656. HardwareLoopInfo &HWLoopInfo) {
  657. const PPCTargetMachine &TM = ST->getTargetMachine();
  658. TargetSchedModel SchedModel;
  659. SchedModel.init(ST);
  660. // Do not convert short loops to CTR loops.
  661. unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  662. if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
  663. SmallPtrSet<const Value *, 32> EphValues;
  664. CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
  665. CodeMetrics Metrics;
  666. for (BasicBlock *BB : L->blocks())
  667. Metrics.analyzeBasicBlock(BB, *this, EphValues);
  668. // 6 is an approximate latency for the mtctr instruction.
  669. if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
  670. return false;
  671. }
  672. // We don't want to spill/restore the counter register, and so we don't
  673. // want to use the counter register if the loop contains calls.
  674. SmallPtrSet<const Value *, 4> Visited;
  675. for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
  676. I != IE; ++I)
  677. if (mightUseCTR(*I, LibInfo, Visited))
  678. return false;
  679. SmallVector<BasicBlock*, 4> ExitingBlocks;
  680. L->getExitingBlocks(ExitingBlocks);
  681. // If there is an exit edge known to be frequently taken,
  682. // we should not transform this loop.
  683. for (auto &BB : ExitingBlocks) {
  684. Instruction *TI = BB->getTerminator();
  685. if (!TI) continue;
  686. if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
  687. uint64_t TrueWeight = 0, FalseWeight = 0;
  688. if (!BI->isConditional() ||
  689. !BI->extractProfMetadata(TrueWeight, FalseWeight))
  690. continue;
  691. // If the exit path is more frequent than the loop path,
  692. // we return here without further analysis for this loop.
  693. bool TrueIsExit = !L->contains(BI->getSuccessor(0));
  694. if (( TrueIsExit && FalseWeight < TrueWeight) ||
  695. (!TrueIsExit && FalseWeight > TrueWeight))
  696. return false;
  697. }
  698. }
  699. // If an exit block has a PHI that accesses a TLS variable as one of the
  700. // incoming values from the loop, we cannot produce a CTR loop because the
  701. // address for that value will be computed in the loop.
  702. SmallVector<BasicBlock *, 4> ExitBlocks;
  703. L->getExitBlocks(ExitBlocks);
  704. for (auto &BB : ExitBlocks) {
  705. for (auto &PHI : BB->phis()) {
  706. for (int Idx = 0, EndIdx = PHI.getNumIncomingValues(); Idx < EndIdx;
  707. Idx++) {
  708. const BasicBlock *IncomingBB = PHI.getIncomingBlock(Idx);
  709. const Value *IncomingValue = PHI.getIncomingValue(Idx);
  710. if (L->contains(IncomingBB) &&
  711. memAddrUsesCTR(IncomingValue, TM, Visited))
  712. return false;
  713. }
  714. }
  715. }
  716. LLVMContext &C = L->getHeader()->getContext();
  717. HWLoopInfo.CountType = TM.isPPC64() ?
  718. Type::getInt64Ty(C) : Type::getInt32Ty(C);
  719. HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  720. return true;
  721. }
  722. void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  723. TTI::UnrollingPreferences &UP,
  724. OptimizationRemarkEmitter *ORE) {
  725. if (ST->getCPUDirective() == PPC::DIR_A2) {
  726. // The A2 is in-order with a deep pipeline, and concatenation unrolling
  727. // helps expose latency-hiding opportunities to the instruction scheduler.
  728. UP.Partial = UP.Runtime = true;
  729. // We unroll a lot on the A2 (hundreds of instructions), and the benefits
  730. // often outweigh the cost of a division to compute the trip count.
  731. UP.AllowExpensiveTripCount = true;
  732. }
  733. BaseT::getUnrollingPreferences(L, SE, UP, ORE);
  734. }
  735. void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  736. TTI::PeelingPreferences &PP) {
  737. BaseT::getPeelingPreferences(L, SE, PP);
  738. }
  739. // This function returns true to allow using the coldcc calling convention.
  740. // Returning true results in coldcc being used for functions that are cold at
  741. // all call sites, provided their callers do not call any other non-coldcc
  742. // functions.
  743. bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  744. return EnablePPCColdCC;
  745. }
  746. bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  747. // On the A2, always unroll aggressively.
  748. if (ST->getCPUDirective() == PPC::DIR_A2)
  749. return true;
  750. return LoopHasReductions;
  751. }
  752. PPCTTIImpl::TTI::MemCmpExpansionOptions
  753. PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  754. TTI::MemCmpExpansionOptions Options;
  755. Options.LoadSizes = {8, 4, 2, 1};
  756. Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  757. return Options;
  758. }
  759. bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  760. return true;
  761. }
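// Number of allocatable registers in the given class: 32 GPRs, FPRs or Altivec
// VRs, or 64 VSX registers (the VSX set overlaps the FPRs and VRs).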
  762. unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  763. assert(ClassID == GPRRC || ClassID == FPRRC ||
  764. ClassID == VRRC || ClassID == VSXRC);
  765. if (ST->hasVSX()) {
  766. assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
  767. return ClassID == VSXRC ? 64 : 32;
  768. }
  769. assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  770. return 32;
  771. }
  772. unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  773. if (Vector)
  774. return ST->hasVSX() ? VSXRC : VRRC;
  775. else if (Ty && (Ty->getScalarType()->isFloatTy() ||
  776. Ty->getScalarType()->isDoubleTy()))
  777. return ST->hasVSX() ? VSXRC : FPRRC;
  778. else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
  779. Ty->getScalarType()->isPPC_FP128Ty()))
  780. return VRRC;
  781. else if (Ty && Ty->getScalarType()->isHalfTy())
  782. return VSXRC;
  783. else
  784. return GPRRC;
  785. }
  786. const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
  787. switch (ClassID) {
  788. default:
  789. llvm_unreachable("unknown register class");
  790. return "PPC::unknown register class";
  791. case GPRRC: return "PPC::GPRRC";
  792. case FPRRC: return "PPC::FPRRC";
  793. case VRRC: return "PPC::VRRC";
  794. case VSXRC: return "PPC::VSXRC";
  795. }
  796. }
  797. TypeSize
  798. PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  799. switch (K) {
  800. case TargetTransformInfo::RGK_Scalar:
  801. return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  802. case TargetTransformInfo::RGK_FixedWidthVector:
  803. return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  804. case TargetTransformInfo::RGK_ScalableVector:
  805. return TypeSize::getScalable(0);
  806. }
  807. llvm_unreachable("Unsupported register kind");
  808. }
  809. unsigned PPCTTIImpl::getCacheLineSize() const {
  810. // Check first if the user specified a custom line size.
  811. if (CacheLineSize.getNumOccurrences() > 0)
  812. return CacheLineSize;
  813. // Starting with P7, the cache line size is 128 bytes.
  814. unsigned Directive = ST->getCPUDirective();
  815. // Assume that Future CPU has the same cache line size as the others.
  816. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
  817. Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
  818. Directive == PPC::DIR_PWR_FUTURE)
  819. return 128;
  820. // On other processors return a default of 64 bytes.
  821. return 64;
  822. }
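// How far ahead of a load (in instructions) prefetches should be issued, as
// consumed by the loop data prefetch pass.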
  823. unsigned PPCTTIImpl::getPrefetchDistance() const {
  824. return 300;
  825. }
  826. unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  827. unsigned Directive = ST->getCPUDirective();
  828. // The 440 has no SIMD support, but floating-point instructions
  829. // have a 5-cycle latency, so unroll by 5x for latency hiding.
  830. if (Directive == PPC::DIR_440)
  831. return 5;
  832. // The A2 has no SIMD support, but floating-point instructions
  833. // have a 6-cycle latency, so unroll by 6x for latency hiding.
  834. if (Directive == PPC::DIR_A2)
  835. return 6;
  836. // FIXME: For lack of any better information, do no harm...
  837. if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
  838. return 1;
  839. // For P7 and P8, floating-point instructions have a 6-cycle latency and
  840. // there are two execution units, so unroll by 12x for latency hiding.
  841. // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  842. // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  843. // Assume that future is the same as the others.
  844. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
  845. Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
  846. Directive == PPC::DIR_PWR_FUTURE)
  847. return 12;
  848. // For most things, modern systems have two execution units (and
  849. // out-of-order execution).
  850. return 2;
  851. }
  852. // Returns a cost adjustment factor for vector instructions on targets where
  853. // the vector and scalar units overlap, which reduces the overall throughput
  854. // of vector code relative to scalar code.
  855. // An invalid instruction cost is returned if the type is an MMA vector type.
  856. InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
  857. Type *Ty1, Type *Ty2) {
  858. // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  859. // instruction cost is returned. This is to signify to other cost computing
  860. // functions to return the maximum instruction cost in order to prevent any
  861. // opportunities for the optimizer to produce MMA types within the IR.
  862. if (isMMAType(Ty1))
  863. return InstructionCost::getInvalid();
  864. if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
  865. return InstructionCost(1);
  866. std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
  867. // If type legalization involves splitting the vector, we don't want to
  868. // double the cost at every step - only the last step.
  869. if (LT1.first != 1 || !LT1.second.isVector())
  870. return InstructionCost(1);
  871. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  872. if (TLI->isOperationExpand(ISD, LT1.second))
  873. return InstructionCost(1);
  874. if (Ty2) {
  875. std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
  876. if (LT2.first != 1 || !LT2.second.isVector())
  877. return InstructionCost(1);
  878. }
  879. return InstructionCost(2);
  880. }
  881. InstructionCost PPCTTIImpl::getArithmeticInstrCost(
  882. unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
  883. TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
  884. TTI::OperandValueProperties Opd1PropInfo,
  885. TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
  886. const Instruction *CxtI) {
  887. assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
  888. InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  889. if (!CostFactor.isValid())
  890. return InstructionCost::getMax();
  891. // TODO: Handle more cost kinds.
  892. if (CostKind != TTI::TCK_RecipThroughput)
  893. return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
  894. Op2Info, Opd1PropInfo,
  895. Opd2PropInfo, Args, CxtI);
  896. // Fallback to the default implementation.
  897. InstructionCost Cost = BaseT::getArithmeticInstrCost(
  898. Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
  899. return Cost * CostFactor;
  900. }
  901. InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
  902. ArrayRef<int> Mask, int Index,
  903. Type *SubTp) {
  904. InstructionCost CostFactor =
  905. vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  906. if (!CostFactor.isValid())
  907. return InstructionCost::getMax();
  908. // Legalize the type.
  909. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  910. // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
  911. // (at least in the sense that there need only be one non-loop-invariant
  912. // instruction). We need one such shuffle instruction for each actual
  913. // register (this is not true for arbitrary shuffles, but is true for the
  914. // structured types of shuffles covered by TTI::ShuffleKind).
  915. return LT.first * CostFactor;
  916. }
  917. InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
  918. TTI::TargetCostKind CostKind,
  919. const Instruction *I) {
  920. if (CostKind != TTI::TCK_RecipThroughput)
  921. return Opcode == Instruction::PHI ? 0 : 1;
  922. // Branches are assumed to be predicted.
  923. return 0;
  924. }
  925. InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  926. Type *Src,
  927. TTI::CastContextHint CCH,
  928. TTI::TargetCostKind CostKind,
  929. const Instruction *I) {
  930. assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
  931. InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  932. if (!CostFactor.isValid())
  933. return InstructionCost::getMax();
  934. InstructionCost Cost =
  935. BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  936. Cost *= CostFactor;
  937. // TODO: Allow non-throughput costs that aren't binary.
  938. if (CostKind != TTI::TCK_RecipThroughput)
  939. return Cost == 0 ? 0 : 1;
  940. return Cost;
  941. }
  942. InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  943. Type *CondTy,
  944. CmpInst::Predicate VecPred,
  945. TTI::TargetCostKind CostKind,
  946. const Instruction *I) {
  947. InstructionCost CostFactor =
  948. vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  949. if (!CostFactor.isValid())
  950. return InstructionCost::getMax();
  951. InstructionCost Cost =
  952. BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  953. // TODO: Handle other cost kinds.
  954. if (CostKind != TTI::TCK_RecipThroughput)
  955. return Cost;
  956. return Cost * CostFactor;
  957. }
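// Cost of inserting or extracting a single vector element. With VSX and direct
// moves the cost can be a small constant; without them, Altivec element
// accesses go through memory and pay a load-hit-store penalty.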
  958. InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
  959. unsigned Index) {
  960. assert(Val->isVectorTy() && "This must be a vector type");
  961. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  962. assert(ISD && "Invalid opcode");
  963. InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  964. if (!CostFactor.isValid())
  965. return InstructionCost::getMax();
  966. InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
  967. Cost *= CostFactor;
  968. if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
  969. // Double-precision scalars are already located in index #0 (or #1 if LE).
  970. if (ISD == ISD::EXTRACT_VECTOR_ELT &&
  971. Index == (ST->isLittleEndian() ? 1 : 0))
  972. return 0;
  973. return Cost;
  974. } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
  975. if (ST->hasP9Altivec()) {
  976. if (ISD == ISD::INSERT_VECTOR_ELT)
  977. // A move-to VSR and a permute/insert. Assume vector operation cost
  978. // for both (cost will be 2x on P9).
  979. return 2 * CostFactor;
  980. // It's an extract. Maybe we can do a cheap move-from VSR.
  981. unsigned EltSize = Val->getScalarSizeInBits();
  982. if (EltSize == 64) {
  983. unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
  984. if (Index == MfvsrdIndex)
  985. return 1;
  986. } else if (EltSize == 32) {
  987. unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
  988. if (Index == MfvsrwzIndex)
  989. return 1;
  990. }
  991. // We need a vector extract (or mfvsrld). Assume vector operation cost.
  992. // The cost of the load constant for a vector extract is disregarded
  993. // (invariant, easily schedulable).
  994. return CostFactor;
  995. } else if (ST->hasDirectMove())
  996. // Assume permute has standard cost.
  997. // Assume move-to/move-from VSR have 2x standard cost.
  998. return 3;
  999. }
  1000. // Estimated cost of a load-hit-store delay. This was obtained
  1001. // experimentally as a minimum needed to prevent unprofitable
  1002. // vectorization for the paq8p benchmark. It may need to be
  1003. // raised further if other unprofitable cases remain.
  1004. unsigned LHSPenalty = 2;
  1005. if (ISD == ISD::INSERT_VECTOR_ELT)
  1006. LHSPenalty += 7;
  1007. // Vector element insert/extract with Altivec is very expensive,
  1008. // because they require store and reload with the attendant
  1009. // processor stall for load-hit-store. Until VSX is available,
  1010. // these need to be estimated as very costly.
  1011. if (ISD == ISD::EXTRACT_VECTOR_ELT ||
  1012. ISD == ISD::INSERT_VECTOR_ELT)
  1013. return LHSPenalty + Cost;
  1014. return Cost;
  1015. }
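// Scalar and vector load/store cost, including the extra cost of unaligned
// accesses that must be split or lowered to a permutation-based sequence.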
  1016. InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
  1017. MaybeAlign Alignment,
  1018. unsigned AddressSpace,
  1019. TTI::TargetCostKind CostKind,
  1020. const Instruction *I) {
  1021. InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  1022. if (!CostFactor.isValid())
  1023. return InstructionCost::getMax();
  1024. if (TLI->getValueType(DL, Src, true) == MVT::Other)
  1025. return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1026. CostKind);
  1027. // Legalize the type.
  1028. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  1029. assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
  1030. "Invalid Opcode");
  1031. InstructionCost Cost =
  1032. BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  1033. // TODO: Handle other cost kinds.
  1034. if (CostKind != TTI::TCK_RecipThroughput)
  1035. return Cost;
  1036. Cost *= CostFactor;
  1037. bool IsAltivecType = ST->hasAltivec() &&
  1038. (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
  1039. LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  1040. bool IsVSXType = ST->hasVSX() &&
  1041. (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  1042. // VSX has 32b/64b load instructions. Legalization can handle loading of
  1043. // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  1044. // PPCTargetLowering can't compute the cost appropriately. So here we
  1045. // explicitly check this case.
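// Note: despite the variable name, MemBytes below holds a size in bits.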
  1046. unsigned MemBytes = Src->getPrimitiveSizeInBits();
  1047. if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
  1048. (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
  1049. return 1;
  1050. // Aligned loads and stores are easy.
  1051. unsigned SrcBytes = LT.second.getStoreSize();
  1052. if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
  1053. return Cost;
  1054. // If we can use the permutation-based load sequence, then this is also
  1055. // relatively cheap (not counting loop-invariant instructions): one load plus
  1056. // one permute (the last load in a series has extra cost, but we're
  1057. // neglecting that here). Note that on the P7, we could do unaligned loads
  1058. // for Altivec types using the VSX instructions, but that's more expensive
  1059. // than using the permutation-based load sequence. On the P8, that's no
  1060. // longer true.
  1061. if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
  1062. *Alignment >= LT.second.getScalarType().getStoreSize())
  1063. return Cost + LT.first; // Add the cost of the permutations.
  1064. // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  1065. // P7, unaligned vector loads are more expensive than the permutation-based
  1066. // load sequence, so that might be used instead, but regardless, the net cost
  1067. // is about the same (not counting loop-invariant instructions).
  1068. if (IsVSXType || (ST->hasVSX() && IsAltivecType))
  1069. return Cost;
  1070. // Newer PPC supports unaligned memory access.
  1071. if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
  1072. return Cost;
  1073. // PPC in general does not support unaligned loads and stores. They'll need
  1074. // to be decomposed based on the alignment factor.
  1075. // Add the cost of each scalar load or store.
  1076. assert(Alignment);
  1077. Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);
  1078. // For a vector type, there is also scalarization overhead. This applies only
  1079. // to stores; loads are expanded using the vector-load + permutation sequence,
  1080. // which is much less expensive.
  1081. if (Src->isVectorTy() && Opcode == Instruction::Store)
  1082. for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
  1083. ++i)
  1084. Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
  1085. return Cost;
  1086. }
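// Cost of an interleaved load/store group: the wide memory access itself plus
// Factor * (number of legalized registers - 1) permutations.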
  1087. InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
  1088. unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
  1089. Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  1090. bool UseMaskForCond, bool UseMaskForGaps) {
  1091. InstructionCost CostFactor =
  1092. vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  1093. if (!CostFactor.isValid())
  1094. return InstructionCost::getMax();
  1095. if (UseMaskForCond || UseMaskForGaps)
  1096. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  1097. Alignment, AddressSpace, CostKind,
  1098. UseMaskForCond, UseMaskForGaps);
  1099. assert(isa<VectorType>(VecTy) &&
  1100. "Expect a vector type for interleaved memory op");
  1101. // Legalize the type.
  1102. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
  1103. // First, the cost of the load/store operation itself.
  1104. InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
  1105. AddressSpace, CostKind);
  1106. // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
  1107. // (at least in the sense that there need only be one non-loop-invariant
  1108. // instruction). For each result vector, we need one shuffle per incoming
  1109. // vector (except that the first shuffle can take two incoming vectors
  1110. // because it does not need to take itself).
  1111. Cost += Factor*(LT.first-1);
  1112. return Cost;
  1113. }
  1114. InstructionCost
  1115. PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  1116. TTI::TargetCostKind CostKind) {
  1117. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1118. }
  1119. bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
  1120. const Function *Callee,
  1121. const ArrayRef<Type *> &Types) const {
  1122. // We need to ensure that argument promotion does not
  1123. // attempt to promote pointers to MMA types (__vector_pair
  1124. // and __vector_quad) since these types explicitly cannot be
  1125. // passed as arguments. Both of these types are larger than
  1126. // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  1127. if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
  1128. return false;
  1129. return llvm::none_of(Types, [](Type *Ty) {
  1130. if (Ty->isSized())
  1131. return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
  1132. return false;
  1133. });
  1134. }
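// Return true if the compare feeding this loop's exit branch can be subsumed
// by the CTR-based hardware loop (bdnz), so passes such as LSR should keep it.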
  1135. bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
  1136. LoopInfo *LI, DominatorTree *DT,
  1137. AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  1138. // Process nested loops first.
  1139. for (Loop *I : *L)
  1140. if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
  1141. return false; // Stop search.
  1142. HardwareLoopInfo HWLoopInfo(L);
  1143. if (!HWLoopInfo.canAnalyze(*LI))
  1144. return false;
  1145. if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
  1146. return false;
  1147. if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
  1148. return false;
  1149. *BI = HWLoopInfo.ExitBranch;
  1150. return true;
  1151. }
  1152. bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
  1153. TargetTransformInfo::LSRCost &C2) {
  1154. // The PowerPC default here is to give instruction count first priority.
  1155. // If LsrNoInsnsCost is set, fall back to the default implementation.
  1156. if (!LsrNoInsnsCost)
  1157. return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
  1158. C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
  1159. std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
  1160. C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  1161. else
  1162. return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  1163. }
  1164. bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  1165. return false;
  1166. }
  1167. bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  1168. const PPCTargetMachine &TM = ST->getTargetMachine();
  1169. // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  1170. if (!TM.isELFv2ABI())
  1171. return false;
  1172. return BaseT::shouldBuildRelLookupTables();
  1173. }
  1174. bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
  1175. MemIntrinsicInfo &Info) {
  1176. switch (Inst->getIntrinsicID()) {
  1177. case Intrinsic::ppc_altivec_lvx:
  1178. case Intrinsic::ppc_altivec_lvxl:
  1179. case Intrinsic::ppc_altivec_lvebx:
  1180. case Intrinsic::ppc_altivec_lvehx:
  1181. case Intrinsic::ppc_altivec_lvewx:
  1182. case Intrinsic::ppc_vsx_lxvd2x:
  1183. case Intrinsic::ppc_vsx_lxvw4x:
  1184. case Intrinsic::ppc_vsx_lxvd2x_be:
  1185. case Intrinsic::ppc_vsx_lxvw4x_be:
  1186. case Intrinsic::ppc_vsx_lxvl:
  1187. case Intrinsic::ppc_vsx_lxvll:
  1188. case Intrinsic::ppc_vsx_lxvp: {
  1189. Info.PtrVal = Inst->getArgOperand(0);
  1190. Info.ReadMem = true;
  1191. Info.WriteMem = false;
  1192. return true;
  1193. }
  1194. case Intrinsic::ppc_altivec_stvx:
  1195. case Intrinsic::ppc_altivec_stvxl:
  1196. case Intrinsic::ppc_altivec_stvebx:
  1197. case Intrinsic::ppc_altivec_stvehx:
  1198. case Intrinsic::ppc_altivec_stvewx:
  1199. case Intrinsic::ppc_vsx_stxvd2x:
  1200. case Intrinsic::ppc_vsx_stxvw4x:
  1201. case Intrinsic::ppc_vsx_stxvd2x_be:
  1202. case Intrinsic::ppc_vsx_stxvw4x_be:
  1203. case Intrinsic::ppc_vsx_stxvl:
  1204. case Intrinsic::ppc_vsx_stxvll:
  1205. case Intrinsic::ppc_vsx_stxvp: {
  1206. Info.PtrVal = Inst->getArgOperand(1);
  1207. Info.ReadMem = false;
  1208. Info.WriteMem = true;
  1209. return true;
  1210. }
  1211. default:
  1212. break;
  1213. }
  1214. return false;
  1215. }
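// Whether a load/store of DataType can use the ISA 3.0 load/store-with-length
// instructions (lxvl/stxvl): 64-bit mode only, with a 128-bit vector or a
// scalar type that fits in a vector element.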
  1216. bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
  1217. Align Alignment) const {
  1218. // Only load and store instructions can have a variable vector length on Power.
  1219. if (Opcode != Instruction::Load && Opcode != Instruction::Store)
  1220. return false;
  1221. // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  1222. // therefore cannot be used in 32-bit mode.
  1223. if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
  1224. return false;
  1225. if (isa<FixedVectorType>(DataType)) {
  1226. unsigned VecWidth = DataType->getPrimitiveSizeInBits();
  1227. return VecWidth == 128;
  1228. }
  1229. Type *ScalarTy = DataType->getScalarType();
  1230. if (ScalarTy->isPointerTy())
  1231. return true;
  1232. if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
  1233. return true;
  1234. if (!ScalarTy->isIntegerTy())
  1235. return false;
  1236. unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  1237. return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
  1238. }
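// Cost of a vector-predicated (VP) load/store. When load/store-with-length can
// be used, the cost is close to that of an ordinary vector memory op, with an
// extra penalty on Power9 for potentially misaligned accesses; otherwise the
// operation falls back to the masked memory op cost (scalarization).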
  1239. InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
  1240. Align Alignment,
  1241. unsigned AddressSpace,
  1242. TTI::TargetCostKind CostKind,
  1243. const Instruction *I) {
  1244. InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
  1245. AddressSpace, CostKind, I);
  1246. if (TLI->getValueType(DL, Src, true) == MVT::Other)
  1247. return Cost;
  1248. // TODO: Handle other cost kinds.
  1249. if (CostKind != TTI::TCK_RecipThroughput)
  1250. return Cost;
  1251. assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
  1252. "Invalid Opcode");
  1253. auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  1254. assert(SrcVTy && "Expected a vector type for VP memory operations");
  1255. if (hasActiveVectorLength(Opcode, Src, Alignment)) {
  1256. std::pair<InstructionCost, MVT> LT =
  1257. TLI->getTypeLegalizationCost(DL, SrcVTy);
  1258. InstructionCost CostFactor =
  1259. vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  1260. if (!CostFactor.isValid())
  1261. return InstructionCost::getMax();
  1262. InstructionCost Cost = LT.first * CostFactor;
  1263. assert(Cost.isValid() && "Expected valid cost");
  1264. // On P9 but not on P10, if the op is misaligned then it will cause a
  1265. // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
  1266. // ones.
  1267. const Align DesiredAlignment(16);
  1268. if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
  1269. return Cost;
  1270. // Since the alignment may be underestimated, we try to compute the probability
  1271. // that the actual address is aligned to the desired boundary. For example
  1272. // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
  1273. // time, while a 4-byte aligned load has a 25% chance of being 16-byte
  1274. // aligned.
  1275. float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
  1276. float MisalignmentProb = 1.0 - AlignmentProb;
  1277. return (MisalignmentProb * P9PipelineFlushEstimate) +
  1278. (AlignmentProb * *Cost.getValue());
  1279. }
  1280. // Usually we should not get to this point, but the following is an attempt to
  1281. // model the cost of legalization. Currently we can only lower intrinsics with
  1282. // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  1283. return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  1284. }