BasicTTIImpl.h

  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. /// \file
  15. /// This file provides a helper that implements much of the TTI interface in
  16. /// terms of the target-independent code generator and TargetLowering
  17. /// interfaces.
  18. //
  19. //===----------------------------------------------------------------------===//
  20. #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
  21. #define LLVM_CODEGEN_BASICTTIIMPL_H
  22. #include "llvm/ADT/APInt.h"
  23. #include "llvm/ADT/ArrayRef.h"
  24. #include "llvm/ADT/BitVector.h"
  25. #include "llvm/ADT/SmallPtrSet.h"
  26. #include "llvm/ADT/SmallVector.h"
  27. #include "llvm/Analysis/LoopInfo.h"
  28. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  29. #include "llvm/Analysis/TargetTransformInfo.h"
  30. #include "llvm/Analysis/TargetTransformInfoImpl.h"
  31. #include "llvm/CodeGen/ISDOpcodes.h"
  32. #include "llvm/CodeGen/TargetLowering.h"
  33. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  34. #include "llvm/CodeGen/ValueTypes.h"
  35. #include "llvm/IR/BasicBlock.h"
  36. #include "llvm/IR/Constant.h"
  37. #include "llvm/IR/Constants.h"
  38. #include "llvm/IR/DataLayout.h"
  39. #include "llvm/IR/DerivedTypes.h"
  40. #include "llvm/IR/InstrTypes.h"
  41. #include "llvm/IR/Instruction.h"
  42. #include "llvm/IR/Instructions.h"
  43. #include "llvm/IR/Intrinsics.h"
  44. #include "llvm/IR/Operator.h"
  45. #include "llvm/IR/Type.h"
  46. #include "llvm/IR/Value.h"
  47. #include "llvm/Support/Casting.h"
  48. #include "llvm/Support/CommandLine.h"
  49. #include "llvm/Support/ErrorHandling.h"
  50. #include "llvm/Support/MachineValueType.h"
  51. #include "llvm/Support/MathExtras.h"
  52. #include "llvm/Target/TargetMachine.h"
  53. #include <algorithm>
  54. #include <cassert>
  55. #include <cstdint>
  56. #include <limits>
  57. #include <utility>
  58. namespace llvm {
  59. class Function;
  60. class GlobalValue;
  61. class LLVMContext;
  62. class ScalarEvolution;
  63. class SCEV;
  64. class TargetMachine;
  65. extern cl::opt<unsigned> PartialUnrollingThreshold;
  66. /// Base class which can be used to help build a TTI implementation.
  67. ///
  68. /// This class provides as much implementation of the TTI interface as is
  69. /// possible using the target independent parts of the code generator.
  70. ///
  71. /// In order to subclass it, your class must implement a getST() method to
  72. /// return the subtarget, and a getTLI() method to return the target lowering.
  73. /// We need these methods implemented in the derived class so that this class
  74. /// doesn't have to duplicate storage for them.
  75. template <typename T>
  76. class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
  77. private:
  78. using BaseT = TargetTransformInfoImplCRTPBase<T>;
  79. using TTI = TargetTransformInfo;
  80. /// Helper function to access this as a T.
  81. T *thisT() { return static_cast<T *>(this); }
  82. /// Estimate a cost of Broadcast as an extract and sequence of insert
  83. /// operations.
  84. InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
  85. InstructionCost Cost = 0;
  86. // Broadcast cost is equal to the cost of extracting the zero'th element
  87. // plus the cost of inserting it into every element of the result vector.
  88. Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
  89. for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
  90. Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
  91. }
  92. return Cost;
  93. }
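// Illustrative sketch for getBroadcastShuffleOverhead above (assuming a
// hypothetical target where each getVectorInstrCost() call returns 1):
// broadcasting a <4 x i32> is estimated as 1 extract + 4 inserts = 5.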
  94. /// Estimate a cost of shuffle as a sequence of extract and insert
  95. /// operations.
  96. InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
  97. InstructionCost Cost = 0;
  98. // Shuffle cost is equal to the cost of extracting elements from its arguments
  99. // plus the cost of inserting them into the result vector.
  100. // e.g. a <4 x float> shuffle with mask <0,5,2,7> needs to extract from
  101. // index 0 of the first vector, index 1 of the second vector, index 2 of the
  102. // first vector and finally index 3 of the second vector, and insert them at
  103. // indices <0,1,2,3> of the result vector.
  104. for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
  105. Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
  106. Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
  107. }
  108. return Cost;
  109. }
  110. /// Estimate a cost of subvector extraction as a sequence of extract and
  111. /// insert operations.
  112. InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
  113. FixedVectorType *SubVTy) {
  114. assert(VTy && SubVTy &&
  115. "Can only extract subvectors from vectors");
  116. int NumSubElts = SubVTy->getNumElements();
  117. assert((!isa<FixedVectorType>(VTy) ||
  118. (Index + NumSubElts) <=
  119. (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
  120. "SK_ExtractSubvector index out of range");
  121. InstructionCost Cost = 0;
  122. // Subvector extraction cost is equal to the cost of extracting elements from
  123. // the source type plus the cost of inserting them into the result vector
  124. // type.
  125. for (int i = 0; i != NumSubElts; ++i) {
  126. Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
  127. i + Index);
  128. Cost +=
  129. thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
  130. }
  131. return Cost;
  132. }
  133. /// Estimate a cost of subvector insertion as a sequence of extract and
  134. /// insert operations.
  135. InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
  136. FixedVectorType *SubVTy) {
  137. assert(VTy && SubVTy &&
  138. "Can only insert subvectors into vectors");
  139. int NumSubElts = SubVTy->getNumElements();
  140. assert((!isa<FixedVectorType>(VTy) ||
  141. (Index + NumSubElts) <=
  142. (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
  143. "SK_InsertSubvector index out of range");
  144. InstructionCost Cost = 0;
  145. // Subvector insertion cost is equal to the cost of extracting elements from
  146. // the source type plus the cost of inserting them into the result vector
  147. // type.
  148. for (int i = 0; i != NumSubElts; ++i) {
  149. Cost +=
  150. thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
  151. Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
  152. i + Index);
  153. }
  154. return Cost;
  155. }
  156. /// Local query method delegates up to T which *must* implement this!
  157. const TargetSubtargetInfo *getST() const {
  158. return static_cast<const T *>(this)->getST();
  159. }
  160. /// Local query method delegates up to T which *must* implement this!
  161. const TargetLoweringBase *getTLI() const {
  162. return static_cast<const T *>(this)->getTLI();
  163. }
  164. static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
  165. switch (M) {
  166. case TTI::MIM_Unindexed:
  167. return ISD::UNINDEXED;
  168. case TTI::MIM_PreInc:
  169. return ISD::PRE_INC;
  170. case TTI::MIM_PreDec:
  171. return ISD::PRE_DEC;
  172. case TTI::MIM_PostInc:
  173. return ISD::POST_INC;
  174. case TTI::MIM_PostDec:
  175. return ISD::POST_DEC;
  176. }
  177. llvm_unreachable("Unexpected MemIndexedMode");
  178. }
  179. InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
  180. Align Alignment,
  181. bool VariableMask,
  182. bool IsGatherScatter,
  183. TTI::TargetCostKind CostKind) {
  184. auto *VT = cast<FixedVectorType>(DataTy);
  185. // Assume the target does not have support for gather/scatter operations
  186. // and provide a rough estimate.
  187. //
  188. // First, compute the cost of the individual memory operations.
  189. InstructionCost AddrExtractCost =
  190. IsGatherScatter
  191. ? getVectorInstrCost(Instruction::ExtractElement,
  192. FixedVectorType::get(
  193. PointerType::get(VT->getElementType(), 0),
  194. VT->getNumElements()),
  195. -1)
  196. : 0;
  197. InstructionCost LoadCost =
  198. VT->getNumElements() *
  199. (AddrExtractCost +
  200. getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
  201. // Next, compute the cost of packing the result in a vector.
  202. InstructionCost PackingCost = getScalarizationOverhead(
  203. VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
  204. InstructionCost ConditionalCost = 0;
  205. if (VariableMask) {
  206. // Compute the cost of conditionally executing the memory operations with
  207. // variable masks. This includes extracting the individual conditions, the
  208. // branches, and the PHIs to combine the results.
  209. // NOTE: Estimating the cost of conditionally executing the memory
  210. // operations accurately is quite difficult and the current solution
  211. // provides a very rough estimate only.
  212. ConditionalCost =
  213. VT->getNumElements() *
  214. (getVectorInstrCost(
  215. Instruction::ExtractElement,
  216. FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
  217. VT->getNumElements()),
  218. -1) +
  219. getCFInstrCost(Instruction::Br, CostKind) +
  220. getCFInstrCost(Instruction::PHI, CostKind));
  221. }
  222. return LoadCost + PackingCost + ConditionalCost;
  223. }
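// Illustrative sketch for getCommonMaskedMemoryOpCost above: for a masked
// load of <4 x i32> with a variable mask (VariableMask = true,
// IsGatherScatter = false), assuming a hypothetical target where each scalar
// load, extract, insert, branch and PHI costs 1:
//   LoadCost        = 4 * (0 + 1)     = 4   (no address extracts)
//   PackingCost     = 4 inserts       = 4
//   ConditionalCost = 4 * (1 + 1 + 1) = 12  (i1 extract + br + phi)
//   Total           = 20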
  224. protected:
  225. explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
  226. : BaseT(DL) {}
  227. virtual ~BasicTTIImplBase() = default;
  228. using TargetTransformInfoImplBase::DL;
  229. public:
  230. /// \name Scalar TTI Implementations
  231. /// @{
  232. bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
  233. unsigned AddressSpace, Align Alignment,
  234. bool *Fast) const {
  235. EVT E = EVT::getIntegerVT(Context, BitWidth);
  236. return getTLI()->allowsMisalignedMemoryAccesses(
  237. E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  238. }
  239. bool hasBranchDivergence() { return false; }
  240. bool useGPUDivergenceAnalysis() { return false; }
  241. bool isSourceOfDivergence(const Value *V) { return false; }
  242. bool isAlwaysUniform(const Value *V) { return false; }
  243. unsigned getFlatAddressSpace() {
  244. // Return an invalid address space.
  245. return -1;
  246. }
  247. bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
  248. Intrinsic::ID IID) const {
  249. return false;
  250. }
  251. bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
  252. return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  253. }
  254. unsigned getAssumedAddrSpace(const Value *V) const {
  255. return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  256. }
  257. std::pair<const Value *, unsigned>
  258. getPredicatedAddrSpace(const Value *V) const {
  259. return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  260. }
  261. Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
  262. Value *NewV) const {
  263. return nullptr;
  264. }
  265. bool isLegalAddImmediate(int64_t imm) {
  266. return getTLI()->isLegalAddImmediate(imm);
  267. }
  268. bool isLegalICmpImmediate(int64_t imm) {
  269. return getTLI()->isLegalICmpImmediate(imm);
  270. }
  271. bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
  272. bool HasBaseReg, int64_t Scale,
  273. unsigned AddrSpace, Instruction *I = nullptr) {
  274. TargetLoweringBase::AddrMode AM;
  275. AM.BaseGV = BaseGV;
  276. AM.BaseOffs = BaseOffset;
  277. AM.HasBaseReg = HasBaseReg;
  278. AM.Scale = Scale;
  279. return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  280. }
  281. bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
  282. const DataLayout &DL) const {
  283. EVT VT = getTLI()->getValueType(DL, Ty);
  284. return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  285. }
  286. bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
  287. const DataLayout &DL) const {
  288. EVT VT = getTLI()->getValueType(DL, Ty);
  289. return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  290. }
  291. bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
  292. return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  293. }
  294. bool isNumRegsMajorCostOfLSR() {
  295. return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  296. }
  297. bool isProfitableLSRChainElement(Instruction *I) {
  298. return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  299. }
  300. InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
  301. int64_t BaseOffset, bool HasBaseReg,
  302. int64_t Scale, unsigned AddrSpace) {
  303. TargetLoweringBase::AddrMode AM;
  304. AM.BaseGV = BaseGV;
  305. AM.BaseOffs = BaseOffset;
  306. AM.HasBaseReg = HasBaseReg;
  307. AM.Scale = Scale;
  308. return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  309. }
  310. bool isTruncateFree(Type *Ty1, Type *Ty2) {
  311. return getTLI()->isTruncateFree(Ty1, Ty2);
  312. }
  313. bool isProfitableToHoist(Instruction *I) {
  314. return getTLI()->isProfitableToHoist(I);
  315. }
  316. bool useAA() const { return getST()->useAA(); }
  317. bool isTypeLegal(Type *Ty) {
  318. EVT VT = getTLI()->getValueType(DL, Ty);
  319. return getTLI()->isTypeLegal(VT);
  320. }
  321. InstructionCost getRegUsageForType(Type *Ty) {
  322. InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
  323. assert(Val >= 0 && "Negative cost!");
  324. return Val;
  325. }
  326. InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
  327. ArrayRef<const Value *> Operands,
  328. TTI::TargetCostKind CostKind) {
  329. return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
  330. }
  331. unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
  332. unsigned &JumpTableSize,
  333. ProfileSummaryInfo *PSI,
  334. BlockFrequencyInfo *BFI) {
  335. /// Try to find the estimated number of clusters. Note that the number of
  336. /// clusters identified in this function could be different from the actual
  337. /// numbers found in lowering. This function ignores switches that are
  338. /// lowered with a mix of jump table / bit test / BTree. It was initially
  339. /// intended to be used when estimating the cost of a switch in the inline
  340. /// cost heuristic, but it's a generic cost model that can be used in other
  341. /// places (e.g., in loop unrolling).
  342. unsigned N = SI.getNumCases();
  343. const TargetLoweringBase *TLI = getTLI();
  344. const DataLayout &DL = this->getDataLayout();
  345. JumpTableSize = 0;
  346. bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
  347. // Early exit if neither a jump table nor a bit test is feasible.
  348. if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
  349. return N;
  350. APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
  351. APInt MinCaseVal = MaxCaseVal;
  352. for (auto CI : SI.cases()) {
  353. const APInt &CaseVal = CI.getCaseValue()->getValue();
  354. if (CaseVal.sgt(MaxCaseVal))
  355. MaxCaseVal = CaseVal;
  356. if (CaseVal.slt(MinCaseVal))
  357. MinCaseVal = CaseVal;
  358. }
  359. // Check if suitable for a bit test
  360. if (N <= DL.getIndexSizeInBits(0u)) {
  361. SmallPtrSet<const BasicBlock *, 4> Dests;
  362. for (auto I : SI.cases())
  363. Dests.insert(I.getCaseSuccessor());
  364. if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
  365. DL))
  366. return 1;
  367. }
  368. // Check if suitable for a jump table.
  369. if (IsJTAllowed) {
  370. if (N < 2 || N < TLI->getMinimumJumpTableEntries())
  371. return N;
  372. uint64_t Range =
  373. (MaxCaseVal - MinCaseVal)
  374. .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
  375. // Check whether a range of clusters is dense enough for a jump table
  376. if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
  377. JumpTableSize = Range;
  378. return 1;
  379. }
  380. }
  381. return N;
  382. }
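// Illustrative sketch for getEstimatedNumberOfCaseClusters above
// (hypothetical switches; assuming jump tables are allowed, the bit-test
// check does not fire, and the density check passes): a switch over cases
// {0, 1, ..., 9} has N = 10, MinCaseVal = 0, MaxCaseVal = 9 and Range = 10,
// so JumpTableSize is set to 10 and a single cluster is reported. A sparse
// switch over {0, 1000, 2000000} that fails both checks is simply reported
// as N = 3 clusters.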
  383. bool shouldBuildLookupTables() {
  384. const TargetLoweringBase *TLI = getTLI();
  385. return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
  386. TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  387. }
  388. bool shouldBuildRelLookupTables() const {
  389. const TargetMachine &TM = getTLI()->getTargetMachine();
  390. // If non-PIC mode, do not generate a relative lookup table.
  391. if (!TM.isPositionIndependent())
  392. return false;
  393. /// Relative lookup table entries consist of 32-bit offsets.
  394. /// Do not generate relative lookup tables for large code models
  395. /// on 64-bit architectures where 32-bit offsets might not be enough.
  396. if (TM.getCodeModel() == CodeModel::Medium ||
  397. TM.getCodeModel() == CodeModel::Large)
  398. return false;
  399. Triple TargetTriple = TM.getTargetTriple();
  400. if (!TargetTriple.isArch64Bit())
  401. return false;
  402. // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
  403. // there.
  404. if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
  405. return false;
  406. return true;
  407. }
  408. bool haveFastSqrt(Type *Ty) {
  409. const TargetLoweringBase *TLI = getTLI();
  410. EVT VT = TLI->getValueType(DL, Ty);
  411. return TLI->isTypeLegal(VT) &&
  412. TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  413. }
  414. bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  415. return true;
  416. }
  417. InstructionCost getFPOpCost(Type *Ty) {
  418. // Check whether FADD is available, as a proxy for floating-point in
  419. // general.
  420. const TargetLoweringBase *TLI = getTLI();
  421. EVT VT = TLI->getValueType(DL, Ty);
  422. if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
  423. return TargetTransformInfo::TCC_Basic;
  424. return TargetTransformInfo::TCC_Expensive;
  425. }
  426. unsigned getInliningThresholdMultiplier() { return 1; }
  427. unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  428. int getInlinerVectorBonusPercent() { return 150; }
  429. void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  430. TTI::UnrollingPreferences &UP,
  431. OptimizationRemarkEmitter *ORE) {
  432. // This unrolling functionality is target independent, but to provide some
  433. // motivation for its intended use, for x86:
  434. // According to the Intel 64 and IA-32 Architectures Optimization Reference
  435. // Manual, Intel Core models and later have a loop stream detector (and
  436. // associated uop queue) that can benefit from partial unrolling.
  437. // The relevant requirements are:
  438. // - The loop must have no more than 4 (8 for Nehalem and later) branches
  439. // taken, and none of them may be calls.
  440. // - The loop can have no more than 18 (28 for Nehalem and later) uops.
  441. // According to the Software Optimization Guide for AMD Family 15h
  442. // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
  443. // and loop buffer which can benefit from partial unrolling.
  444. // The relevant requirements are:
  445. // - The loop must have fewer than 16 branches
  446. // - The loop must have less than 40 uops in all executed loop branches
  447. // The number of taken branches in a loop is hard to estimate here, and
  448. // benchmarking has revealed that it is better not to be conservative when
  449. // estimating the branch count. As a result, we'll ignore the branch limits
  450. // until someone finds a case where it matters in practice.
  451. unsigned MaxOps;
  452. const TargetSubtargetInfo *ST = getST();
  453. if (PartialUnrollingThreshold.getNumOccurrences() > 0)
  454. MaxOps = PartialUnrollingThreshold;
  455. else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
  456. MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
  457. else
  458. return;
  459. // Scan the loop: don't unroll loops with calls.
  460. for (BasicBlock *BB : L->blocks()) {
  461. for (Instruction &I : *BB) {
  462. if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
  463. if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
  464. if (!thisT()->isLoweredToCall(F))
  465. continue;
  466. }
  467. if (ORE) {
  468. ORE->emit([&]() {
  469. return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
  470. L->getHeader())
  471. << "advising against unrolling the loop because it "
  472. "contains a "
  473. << ore::NV("Call", &I);
  474. });
  475. }
  476. return;
  477. }
  478. }
  479. }
  480. // Enable runtime and partial unrolling up to the specified size.
  481. // Enable using trip count upper bound to unroll loops.
  482. UP.Partial = UP.Runtime = UP.UpperBound = true;
  483. UP.PartialThreshold = MaxOps;
  484. // Avoid unrolling when optimizing for size.
  485. UP.OptSizeThreshold = 0;
  486. UP.PartialOptSizeThreshold = 0;
  487. // Set number of instructions optimized when "back edge"
  488. // becomes "fall through" to default value of 2.
  489. UP.BEInsns = 2;
  490. }
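// Illustrative sketch for getUnrollingPreferences above: on a hypothetical
// subtarget whose scheduling model reports LoopMicroOpBufferSize = 28 (and
// with no PartialUnrollingThreshold override), a call-free loop ends up with
// Partial/Runtime/UpperBound unrolling enabled and UP.PartialThreshold = 28.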
  491. void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  492. TTI::PeelingPreferences &PP) {
  493. PP.PeelCount = 0;
  494. PP.AllowPeeling = true;
  495. PP.AllowLoopNestsPeeling = false;
  496. PP.PeelProfiledIterations = true;
  497. }
  498. bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
  499. AssumptionCache &AC,
  500. TargetLibraryInfo *LibInfo,
  501. HardwareLoopInfo &HWLoopInfo) {
  502. return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  503. }
  504. bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
  505. AssumptionCache &AC, TargetLibraryInfo *TLI,
  506. DominatorTree *DT,
  507. const LoopAccessInfo *LAI) {
  508. return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  509. }
  510. bool emitGetActiveLaneMask() {
  511. return BaseT::emitGetActiveLaneMask();
  512. }
  513. Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
  514. IntrinsicInst &II) {
  515. return BaseT::instCombineIntrinsic(IC, II);
  516. }
  517. Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
  518. IntrinsicInst &II,
  519. APInt DemandedMask,
  520. KnownBits &Known,
  521. bool &KnownBitsComputed) {
  522. return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
  523. KnownBitsComputed);
  524. }
  525. Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
  526. InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
  527. APInt &UndefElts2, APInt &UndefElts3,
  528. std::function<void(Instruction *, unsigned, APInt, APInt &)>
  529. SimplifyAndSetOp) {
  530. return BaseT::simplifyDemandedVectorEltsIntrinsic(
  531. IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
  532. SimplifyAndSetOp);
  533. }
  534. InstructionCost getInstructionLatency(const Instruction *I) {
  535. if (isa<LoadInst>(I))
  536. return getST()->getSchedModel().DefaultLoadLatency;
  537. return BaseT::getInstructionLatency(I);
  538. }
  539. virtual Optional<unsigned>
  540. getCacheSize(TargetTransformInfo::CacheLevel Level) const {
  541. return Optional<unsigned>(
  542. getST()->getCacheSize(static_cast<unsigned>(Level)));
  543. }
  544. virtual Optional<unsigned>
  545. getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
  546. Optional<unsigned> TargetResult =
  547. getST()->getCacheAssociativity(static_cast<unsigned>(Level));
  548. if (TargetResult)
  549. return TargetResult;
  550. return BaseT::getCacheAssociativity(Level);
  551. }
  552. virtual unsigned getCacheLineSize() const {
  553. return getST()->getCacheLineSize();
  554. }
  555. virtual unsigned getPrefetchDistance() const {
  556. return getST()->getPrefetchDistance();
  557. }
  558. virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
  559. unsigned NumStridedMemAccesses,
  560. unsigned NumPrefetches,
  561. bool HasCall) const {
  562. return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
  563. NumPrefetches, HasCall);
  564. }
  565. virtual unsigned getMaxPrefetchIterationsAhead() const {
  566. return getST()->getMaxPrefetchIterationsAhead();
  567. }
  568. virtual bool enableWritePrefetching() const {
  569. return getST()->enableWritePrefetching();
  570. }
  571. /// @}
  572. /// \name Vector TTI Implementations
  573. /// @{
  574. TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  575. return TypeSize::getFixed(32);
  576. }
  577. Optional<unsigned> getMaxVScale() const { return None; }
  578. Optional<unsigned> getVScaleForTuning() const { return None; }
  579. /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  580. /// are set if the demanded result elements need to be inserted and/or
  581. /// extracted from vectors.
  582. InstructionCost getScalarizationOverhead(VectorType *InTy,
  583. const APInt &DemandedElts,
  584. bool Insert, bool Extract) {
  585. /// FIXME: a bitfield is not a reasonable abstraction for talking about
  586. /// which elements are needed from a scalable vector
  587. auto *Ty = cast<FixedVectorType>(InTy);
  588. assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
  589. "Vector size mismatch");
  590. InstructionCost Cost = 0;
  591. for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
  592. if (!DemandedElts[i])
  593. continue;
  594. if (Insert)
  595. Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  596. if (Extract)
  597. Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
  598. }
  599. return Cost;
  600. }
  601. /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  602. InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
  603. bool Extract) {
  604. auto *Ty = cast<FixedVectorType>(InTy);
  605. APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
  606. return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  607. }
  608. /// Estimate the overhead of scalarizing an instruction's unique
  609. /// non-constant operands. The (potentially vector) types to use for each
  610. /// argument are passed via Tys.
  611. InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
  612. ArrayRef<Type *> Tys) {
  613. assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
  614. InstructionCost Cost = 0;
  615. SmallPtrSet<const Value*, 4> UniqueOperands;
  616. for (int I = 0, E = Args.size(); I != E; I++) {
  617. // Disregard things like metadata arguments.
  618. const Value *A = Args[I];
  619. Type *Ty = Tys[I];
  620. if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
  621. !Ty->isPtrOrPtrVectorTy())
  622. continue;
  623. if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
  624. if (auto *VecTy = dyn_cast<VectorType>(Ty))
  625. Cost += getScalarizationOverhead(VecTy, false, true);
  626. }
  627. }
  628. return Cost;
  629. }
  630. /// Estimate the overhead of scalarizing the inputs and outputs of an
  631. /// instruction, with return type RetTy and arguments Args of type Tys. If
  632. /// Args are unknown (empty), then the cost associated with one argument is
  633. /// added as a heuristic.
  634. InstructionCost getScalarizationOverhead(VectorType *RetTy,
  635. ArrayRef<const Value *> Args,
  636. ArrayRef<Type *> Tys) {
  637. InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
  638. if (!Args.empty())
  639. Cost += getOperandsScalarizationOverhead(Args, Tys);
  640. else
  641. // When no information on arguments is provided, we add the cost
  642. // associated with one argument as a heuristic.
  643. Cost += getScalarizationOverhead(RetTy, false, true);
  644. return Cost;
  645. }
  646. unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
  647. InstructionCost getArithmeticInstrCost(
  648. unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
  649. TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
  650. TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
  651. TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
  652. TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
  653. ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
  654. const Instruction *CxtI = nullptr) {
  655. // Check if any of the operands are vector operands.
  656. const TargetLoweringBase *TLI = getTLI();
  657. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  658. assert(ISD && "Invalid opcode");
  659. // TODO: Handle more cost kinds.
  660. if (CostKind != TTI::TCK_RecipThroughput)
  661. return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
  662. Opd1Info, Opd2Info,
  663. Opd1PropInfo, Opd2PropInfo,
  664. Args, CxtI);
  665. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  666. bool IsFloat = Ty->isFPOrFPVectorTy();
  667. // Assume that floating point arithmetic operations cost twice as much as
  668. // integer operations.
  669. InstructionCost OpCost = (IsFloat ? 2 : 1);
  670. if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
  671. // The operation is legal. Assume it costs 1.
  672. // TODO: Once we have extract/insert subvector cost we need to use them.
  673. return LT.first * OpCost;
  674. }
  675. if (!TLI->isOperationExpand(ISD, LT.second)) {
  676. // If the operation is custom lowered, then assume that the code is twice
  677. // as expensive.
  678. return LT.first * 2 * OpCost;
  679. }
  680. // An 'Expand' of URem and SRem is special because it may default
  681. // to expanding the operation into a sequence of sub-operations
  682. // i.e. X % Y -> X-(X/Y)*Y.
  683. if (ISD == ISD::UREM || ISD == ISD::SREM) {
  684. bool IsSigned = ISD == ISD::SREM;
  685. if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
  686. LT.second) ||
  687. TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
  688. LT.second)) {
  689. unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
  690. InstructionCost DivCost = thisT()->getArithmeticInstrCost(
  691. DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
  692. Opd2PropInfo);
  693. InstructionCost MulCost =
  694. thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
  695. InstructionCost SubCost =
  696. thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
  697. return DivCost + MulCost + SubCost;
  698. }
  699. }
  700. // We cannot scalarize scalable vectors, so return Invalid.
  701. if (isa<ScalableVectorType>(Ty))
  702. return InstructionCost::getInvalid();
  703. // Else, assume that we need to scalarize this op.
  704. // TODO: If one of the types get legalized by splitting, handle this
  705. // similarly to what getCastInstrCost() does.
  706. if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
  707. InstructionCost Cost = thisT()->getArithmeticInstrCost(
  708. Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
  709. Opd1PropInfo, Opd2PropInfo, Args, CxtI);
  710. // Return the cost of multiple scalar invocation plus the cost of
  711. // inserting and extracting the values.
  712. SmallVector<Type *> Tys(Args.size(), Ty);
  713. return getScalarizationOverhead(VTy, Args, Tys) +
  714. VTy->getNumElements() * Cost;
  715. }
  716. // We don't know anything about this scalar instruction.
  717. return OpCost;
  718. }
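// Illustrative sketch for getArithmeticInstrCost above (assuming a
// hypothetical target where each legal scalar operation costs 1): a urem
// whose div and mul are legal follows the X % Y -> X - (X / Y) * Y expansion
// and is estimated as DivCost + MulCost + SubCost = 3, while a <4 x i32>
// operation that must be scalarized costs roughly 4 * (scalar cost) plus the
// insert/extract scalarization overhead.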
  719. TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
  720. ArrayRef<int> Mask) const {
  721. int Limit = Mask.size() * 2;
  722. if (Mask.empty() ||
  723. // Extra check required by isSingleSourceMaskImpl function (called by
  724. // ShuffleVectorInst::isSingleSourceMask).
  725. any_of(Mask, [Limit](int I) { return I >= Limit; }))
  726. return Kind;
  727. switch (Kind) {
  728. case TTI::SK_PermuteSingleSrc:
  729. if (ShuffleVectorInst::isReverseMask(Mask))
  730. return TTI::SK_Reverse;
  731. if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
  732. return TTI::SK_Broadcast;
  733. break;
  734. case TTI::SK_PermuteTwoSrc:
  735. if (ShuffleVectorInst::isSelectMask(Mask))
  736. return TTI::SK_Select;
  737. if (ShuffleVectorInst::isTransposeMask(Mask))
  738. return TTI::SK_Transpose;
  739. break;
  740. case TTI::SK_Select:
  741. case TTI::SK_Reverse:
  742. case TTI::SK_Broadcast:
  743. case TTI::SK_Transpose:
  744. case TTI::SK_InsertSubvector:
  745. case TTI::SK_ExtractSubvector:
  746. case TTI::SK_Splice:
  747. break;
  748. }
  749. return Kind;
  750. }
  751. InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
  752. ArrayRef<int> Mask, int Index,
  753. VectorType *SubTp) {
  754. switch (improveShuffleKindFromMask(Kind, Mask)) {
  755. case TTI::SK_Broadcast:
  756. if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
  757. return getBroadcastShuffleOverhead(FVT);
  758. return InstructionCost::getInvalid();
  759. case TTI::SK_Select:
  760. case TTI::SK_Splice:
  761. case TTI::SK_Reverse:
  762. case TTI::SK_Transpose:
  763. case TTI::SK_PermuteSingleSrc:
  764. case TTI::SK_PermuteTwoSrc:
  765. if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
  766. return getPermuteShuffleOverhead(FVT);
  767. return InstructionCost::getInvalid();
  768. case TTI::SK_ExtractSubvector:
  769. return getExtractSubvectorOverhead(Tp, Index,
  770. cast<FixedVectorType>(SubTp));
  771. case TTI::SK_InsertSubvector:
  772. return getInsertSubvectorOverhead(Tp, Index,
  773. cast<FixedVectorType>(SubTp));
  774. }
  775. llvm_unreachable("Unknown TTI::ShuffleKind");
  776. }
  777. InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
  778. TTI::CastContextHint CCH,
  779. TTI::TargetCostKind CostKind,
  780. const Instruction *I = nullptr) {
  781. if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
  782. return 0;
  783. const TargetLoweringBase *TLI = getTLI();
  784. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  785. assert(ISD && "Invalid opcode");
  786. std::pair<InstructionCost, MVT> SrcLT =
  787. TLI->getTypeLegalizationCost(DL, Src);
  788. std::pair<InstructionCost, MVT> DstLT =
  789. TLI->getTypeLegalizationCost(DL, Dst);
  790. TypeSize SrcSize = SrcLT.second.getSizeInBits();
  791. TypeSize DstSize = DstLT.second.getSizeInBits();
  792. bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
  793. bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
  794. switch (Opcode) {
  795. default:
  796. break;
  797. case Instruction::Trunc:
  798. // Check for NOOP conversions.
  799. if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
  800. return 0;
  801. LLVM_FALLTHROUGH;
  802. case Instruction::BitCast:
  803. // Bitcasts between types that are legalized to the same type are free, and
  804. // int to/from ptr casts of the same size are also assumed to be free.
  805. if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
  806. SrcSize == DstSize)
  807. return 0;
  808. break;
  809. case Instruction::FPExt:
  810. if (I && getTLI()->isExtFree(I))
  811. return 0;
  812. break;
  813. case Instruction::ZExt:
  814. if (TLI->isZExtFree(SrcLT.second, DstLT.second))
  815. return 0;
  816. LLVM_FALLTHROUGH;
  817. case Instruction::SExt:
  818. if (I && getTLI()->isExtFree(I))
  819. return 0;
  820. // If this is a zext/sext of a load, return 0 if the corresponding
  821. // extending load exists on target and the result type is legal.
  822. if (CCH == TTI::CastContextHint::Normal) {
  823. EVT ExtVT = EVT::getEVT(Dst);
  824. EVT LoadVT = EVT::getEVT(Src);
  825. unsigned LType =
  826. ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
  827. if (DstLT.first == SrcLT.first &&
  828. TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
  829. return 0;
  830. }
  831. break;
  832. case Instruction::AddrSpaceCast:
  833. if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
  834. Dst->getPointerAddressSpace()))
  835. return 0;
  836. break;
  837. }
  838. auto *SrcVTy = dyn_cast<VectorType>(Src);
  839. auto *DstVTy = dyn_cast<VectorType>(Dst);
  840. // If the cast is marked as legal (or promote) then assume low cost.
  841. if (SrcLT.first == DstLT.first &&
  842. TLI->isOperationLegalOrPromote(ISD, DstLT.second))
  843. return SrcLT.first;
  844. // Handle scalar conversions.
  845. if (!SrcVTy && !DstVTy) {
  846. // Just check the op cost. If the operation is legal then assume it costs
  847. // 1.
  848. if (!TLI->isOperationExpand(ISD, DstLT.second))
  849. return 1;
  850. // Assume that illegal scalar instruction are expensive.
  851. return 4;
  852. }
  853. // Check vector-to-vector casts.
  854. if (DstVTy && SrcVTy) {
  855. // If the cast is between same-sized registers, then the check is simple.
  856. if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
  857. // Assume that Zext is done using AND.
  858. if (Opcode == Instruction::ZExt)
  859. return SrcLT.first;
  860. // Assume that sext is done using SHL and SRA.
  861. if (Opcode == Instruction::SExt)
  862. return SrcLT.first * 2;
  863. // Just check the op cost. If the operation is legal then assume it
  864. // costs 1 and multiply by the type-legalization overhead.
  866. if (!TLI->isOperationExpand(ISD, DstLT.second))
  867. return SrcLT.first * 1;
  868. }
  869. // If we are legalizing by splitting, query the concrete TTI for the cost
  870. // of casting the original vector twice. We also need to factor in the
  871. // cost of the split itself. Count that as 1, to be consistent with
  872. // TLI->getTypeLegalizationCost().
  873. bool SplitSrc =
  874. TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
  875. TargetLowering::TypeSplitVector;
  876. bool SplitDst =
  877. TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
  878. TargetLowering::TypeSplitVector;
  879. if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
  880. DstVTy->getElementCount().isVector()) {
  881. Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
  882. Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
  883. T *TTI = static_cast<T *>(this);
  884. // If both types need to be split then the split is free.
  885. InstructionCost SplitCost =
  886. (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
  887. return SplitCost +
  888. (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
  889. CostKind, I));
  890. }
  891. // Scalarization cost is Invalid, can't assume any num elements.
  892. if (isa<ScalableVectorType>(DstVTy))
  893. return InstructionCost::getInvalid();
  894. // In other cases where the source or destination are illegal, assume
  895. // the operation will get scalarized.
  896. unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
  897. InstructionCost Cost = thisT()->getCastInstrCost(
  898. Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
  899. // Return the cost of multiple scalar invocation plus the cost of
  900. // inserting and extracting the values.
  901. return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
  902. }
  903. // We already handled vector-to-vector and scalar-to-scalar conversions.
  904. // This is where we handle bitcast between vectors and scalars. We need to
  905. // assume that the conversion is scalarized in one way or another.
  907. if (Opcode == Instruction::BitCast) {
  908. // Illegal bitcasts are done by storing and loading from a stack slot.
  909. return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
  910. (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
  911. }
  912. llvm_unreachable("Unhandled cast");
  913. }
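// Illustrative sketch for getCastInstrCost above (hypothetical target with
// legal 128-bit vector registers): a bitcast between <4 x i32> and <2 x i64>
// legalizes both sides to the same 128-bit type and is costed at 0; a zext
// of a loaded i8 to i32 where the target has a legal ZEXTLOAD is likewise
// free.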
  914. InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
  915. VectorType *VecTy, unsigned Index) {
  916. return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
  917. Index) +
  918. thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
  919. TTI::CastContextHint::None,
  920. TTI::TCK_RecipThroughput);
  921. }
  922. InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
  923. const Instruction *I = nullptr) {
  924. return BaseT::getCFInstrCost(Opcode, CostKind, I);
  925. }
  926. InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
  927. CmpInst::Predicate VecPred,
  928. TTI::TargetCostKind CostKind,
  929. const Instruction *I = nullptr) {
  930. const TargetLoweringBase *TLI = getTLI();
  931. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  932. assert(ISD && "Invalid opcode");
  933. // TODO: Handle other cost kinds.
  934. if (CostKind != TTI::TCK_RecipThroughput)
  935. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  936. I);
  937. // Selects on vectors are actually vector selects.
  938. if (ISD == ISD::SELECT) {
  939. assert(CondTy && "CondTy must exist");
  940. if (CondTy->isVectorTy())
  941. ISD = ISD::VSELECT;
  942. }
  943. std::pair<InstructionCost, MVT> LT =
  944. TLI->getTypeLegalizationCost(DL, ValTy);
  945. if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
  946. !TLI->isOperationExpand(ISD, LT.second)) {
  947. // The operation is legal. Assume it costs 1. Multiply
  948. // by the type-legalization overhead.
  949. return LT.first * 1;
  950. }
  951. // Otherwise, assume that the cast is scalarized.
  952. // TODO: If one of the types get legalized by splitting, handle this
  953. // similarly to what getCastInstrCost() does.
  954. if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
  955. unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
  956. if (CondTy)
  957. CondTy = CondTy->getScalarType();
  958. InstructionCost Cost = thisT()->getCmpSelInstrCost(
  959. Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
  960. // Return the cost of multiple scalar invocation plus the cost of
  961. // inserting and extracting the values.
  962. return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
  963. }
  964. // Unknown scalar opcode.
  965. return 1;
  966. }
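// Illustrative sketch for getCmpSelInstrCost above (assuming unit scalar
// costs): a select on <4 x float> whose legalized type is not a vector is
// scalarized into 4 scalar selects plus the overhead of inserting the 4
// results, i.e. roughly 4 * 1 + 4 = 8.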
  967. InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
  968. unsigned Index) {
  969. std::pair<InstructionCost, MVT> LT =
  970. getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
  971. return LT.first;
  972. }
  973. InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
  974. int VF,
  975. const APInt &DemandedDstElts,
  976. TTI::TargetCostKind CostKind) {
  977. assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
  978. "Unexpected size of DemandedDstElts.");
  979. InstructionCost Cost;
  980. auto *SrcVT = FixedVectorType::get(EltTy, VF);
  981. auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
  982. // The Mask shuffling cost is that of extracting all the elements of the Mask
  983. // and inserting each of them Factor times into the wide vector:
  984. //
  985. // E.g. an interleaved group with factor 3:
  986. // %mask = icmp ult <8 x i32> %vec1, %vec2
  987. // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
  988. // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
  989. // The cost is estimated as extracting all mask elements from the <8xi1> mask
  990. // vector and inserting them Factor times into the <24xi1> shuffled mask
  991. // vector.
  992. APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
  993. Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
  994. /*Insert*/ false,
  995. /*Extract*/ true);
  996. Cost +=
  997. thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
  998. /*Insert*/ true, /*Extract*/ false);
  999. return Cost;
  1000. }
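// Illustrative sketch for getReplicationShuffleCost above (assuming unit
// insert/extract costs and all destination elements demanded): replicating
// an <8 x i1> mask with ReplicationFactor = 3 extracts 8 source elements and
// inserts 24 replicated elements, for an estimated cost of 8 + 24 = 32.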
  1001. InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
  1002. MaybeAlign Alignment, unsigned AddressSpace,
  1003. TTI::TargetCostKind CostKind,
  1004. const Instruction *I = nullptr) {
  1005. assert(!Src->isVoidTy() && "Invalid type");
  1006. // Assume types, such as structs, are expensive.
  1007. if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
  1008. return 4;
  1009. std::pair<InstructionCost, MVT> LT =
  1010. getTLI()->getTypeLegalizationCost(DL, Src);
  1011. // Assuming that all loads of legal types cost 1.
  1012. InstructionCost Cost = LT.first;
  1013. if (CostKind != TTI::TCK_RecipThroughput)
  1014. return Cost;
  1015. if (Src->isVectorTy() &&
  1016. // In practice it's not currently possible to have a change in lane
  1017. // length for extending loads or truncating stores so both types should
  1018. // have the same scalable property.
  1019. TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
  1020. LT.second.getSizeInBits())) {
  1021. // This is a vector load that legalizes to a larger type than the vector
  1022. // itself. Unless the corresponding extending load or truncating store is
  1023. // legal, then this will scalarize.
  1024. TargetLowering::LegalizeAction LA = TargetLowering::Expand;
  1025. EVT MemVT = getTLI()->getValueType(DL, Src);
  1026. if (Opcode == Instruction::Store)
  1027. LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
  1028. else
  1029. LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
  1030. if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
  1031. // This is a vector load/store for some illegal type that is scalarized.
  1032. // We must account for the cost of building or decomposing the vector.
  1033. Cost += getScalarizationOverhead(cast<VectorType>(Src),
  1034. Opcode != Instruction::Store,
  1035. Opcode == Instruction::Store);
  1036. }
  1037. }
  1038. return Cost;
  1039. }
  1040. InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
  1041. Align Alignment, unsigned AddressSpace,
  1042. TTI::TargetCostKind CostKind) {
  1043. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
  1044. CostKind);
  1045. }
  1046. InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
  1047. const Value *Ptr, bool VariableMask,
  1048. Align Alignment,
  1049. TTI::TargetCostKind CostKind,
  1050. const Instruction *I = nullptr) {
  1051. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
  1052. true, CostKind);
  1053. }
  1054. InstructionCost getInterleavedMemoryOpCost(
  1055. unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
  1056. Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  1057. bool UseMaskForCond = false, bool UseMaskForGaps = false) {
  1058. auto *VT = cast<FixedVectorType>(VecTy);
  1059. unsigned NumElts = VT->getNumElements();
  1060. assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  1061. unsigned NumSubElts = NumElts / Factor;
  1062. auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
  1063. // First, the cost of the load/store operation.
  1064. InstructionCost Cost;
  1065. if (UseMaskForCond || UseMaskForGaps)
  1066. Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
  1067. AddressSpace, CostKind);
  1068. else
  1069. Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
  1070. CostKind);
  1071. // Legalize the vector type, and get the legalized and unlegalized type
  1072. // sizes.
  1073. MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  1074. unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
  1075. unsigned VecTyLTSize = VecTyLT.getStoreSize();
  1076. // Scale the cost of the memory operation by the fraction of legalized
  1077. // instructions that will actually be used. We shouldn't account for the
  1078. // cost of dead instructions since they will be removed.
  1079. //
  1080. // E.g., An interleaved load of factor 8:
  1081. // %vec = load <16 x i64>, <16 x i64>* %ptr
  1082. // %v0 = shufflevector %vec, undef, <0, 8>
  1083. //
  1084. // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
  1085. // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
  1086. // type). The other loads are unused.
  1087. //
  1088. // TODO: Note that legalization can turn masked loads/stores into unmasked
  1089. // (legalized) loads/stores. This can be reflected in the cost.
  1090. if (Cost.isValid() && VecTySize > VecTyLTSize) {
  1091. // The number of loads of a legal type it will take to represent a load
  1092. // of the unlegalized vector type.
  1093. unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
  1094. // The number of elements of the unlegalized type that correspond to a
  1095. // single legal instruction.
  1096. unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
  1097. // Determine which legal instructions will be used.
  1098. BitVector UsedInsts(NumLegalInsts, false);
  1099. for (unsigned Index : Indices)
  1100. for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
  1101. UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
  1102. // Scale the cost of the load by the fraction of legal instructions that
  1103. // will be used.
  1104. Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(),
  1105. NumLegalInsts);
  1106. }
// Then add the cost of the interleave operation.
  1108. assert(Indices.size() <= Factor &&
  1109. "Interleaved memory op has too many members");
  1110. const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
  1111. const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
  1112. APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
  1113. for (unsigned Index : Indices) {
  1114. assert(Index < Factor && "Invalid index for interleaved memory op");
  1115. for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
  1116. DemandedLoadStoreElts.setBit(Index + Elm * Factor);
  1117. }
  1118. if (Opcode == Instruction::Load) {
// The interleave cost is similar to extracting the sub-vectors' elements
// from the wide vector and inserting them into the sub-vectors.
//
// E.g. an interleaved load of factor 2 (with one member of index 0):
// %vec = load <8 x i32>, <8 x i32>* %ptr
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extracting elements 0, 2, 4 and 6 from the
// <8 x i32> vector and inserting them into a <4 x i32> vector.
  1127. InstructionCost InsSubCost =
  1128. thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
  1129. /*Insert*/ true, /*Extract*/ false);
  1130. Cost += Indices.size() * InsSubCost;
  1131. Cost +=
  1132. thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
  1133. /*Insert*/ false, /*Extract*/ true);
  1134. } else {
// The interleave cost is that of extracting elements from the sub-vectors
// and inserting them into the wide vector.
//
// E.g. an interleaved store of factor 3 with 2 members at indices 0,1
// (using VF=4):
// %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
// %gaps.mask = <true, true, false, true, true, false,
//               true, true, false, true, true, false>
// call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
//                        i32 Align, <12 x i1> %gaps.mask
// The cost is estimated as extracting all elements (of actual members,
// excluding gaps) from both <4 x i32> vectors and inserting them into the
// <12 x i32> vector.
  1148. InstructionCost ExtSubCost =
  1149. thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
  1150. /*Insert*/ false, /*Extract*/ true);
  1151. Cost += ExtSubCost * Indices.size();
  1152. Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
  1153. /*Insert*/ true,
  1154. /*Extract*/ false);
  1155. }
  1156. if (!UseMaskForCond)
  1157. return Cost;
  1158. Type *I8Type = Type::getInt8Ty(VT->getContext());
  1159. Cost += thisT()->getReplicationShuffleCost(
  1160. I8Type, Factor, NumSubElts,
  1161. UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
  1162. CostKind);
// The Gaps mask is invariant and created outside the loop; therefore the
// cost of creating it is not accounted for here. However, if we have both
// a MaskForGaps and some other mask that guards the execution of the
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
  1168. if (UseMaskForGaps) {
  1169. auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
  1170. Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
  1171. CostKind);
  1172. }
  1173. return Cost;
  1174. }
  1175. /// Get intrinsic cost based on arguments.
  1176. InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  1177. TTI::TargetCostKind CostKind) {
  1178. // Check for generically free intrinsics.
  1179. if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
  1180. return 0;
  1181. // Assume that target intrinsics are cheap.
  1182. Intrinsic::ID IID = ICA.getID();
  1183. if (Function::isTargetIntrinsic(IID))
  1184. return TargetTransformInfo::TCC_Basic;
  1185. if (ICA.isTypeBasedOnly())
  1186. return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
  1187. Type *RetTy = ICA.getReturnType();
  1188. ElementCount RetVF =
  1189. (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
  1190. : ElementCount::getFixed(1));
  1191. const IntrinsicInst *I = ICA.getInst();
  1192. const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
  1193. FastMathFlags FMF = ICA.getFlags();
  1194. switch (IID) {
  1195. default:
  1196. break;
  1197. case Intrinsic::cttz:
  1198. // FIXME: If necessary, this should go in target-specific overrides.
  1199. if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
  1200. return TargetTransformInfo::TCC_Basic;
  1201. break;
  1202. case Intrinsic::ctlz:
  1203. // FIXME: If necessary, this should go in target-specific overrides.
  1204. if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
  1205. return TargetTransformInfo::TCC_Basic;
  1206. break;
  1207. case Intrinsic::memcpy:
  1208. return thisT()->getMemcpyCost(ICA.getInst());
  1209. case Intrinsic::masked_scatter: {
  1210. const Value *Mask = Args[3];
  1211. bool VarMask = !isa<Constant>(Mask);
  1212. Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
  1213. return thisT()->getGatherScatterOpCost(Instruction::Store,
  1214. ICA.getArgTypes()[0], Args[1],
  1215. VarMask, Alignment, CostKind, I);
  1216. }
  1217. case Intrinsic::masked_gather: {
  1218. const Value *Mask = Args[2];
  1219. bool VarMask = !isa<Constant>(Mask);
  1220. Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
  1221. return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
  1222. VarMask, Alignment, CostKind, I);
  1223. }
  1224. case Intrinsic::experimental_stepvector: {
  1225. if (isa<ScalableVectorType>(RetTy))
  1226. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1227. // The cost of materialising a constant integer vector.
  1228. return TargetTransformInfo::TCC_Basic;
  1229. }
  1230. case Intrinsic::experimental_vector_extract: {
  1231. // FIXME: Handle case where a scalable vector is extracted from a scalable
  1232. // vector
  1233. if (isa<ScalableVectorType>(RetTy))
  1234. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1235. unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
  1236. return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
  1237. cast<VectorType>(Args[0]->getType()), None,
  1238. Index, cast<VectorType>(RetTy));
  1239. }
  1240. case Intrinsic::experimental_vector_insert: {
  1241. // FIXME: Handle case where a scalable vector is inserted into a scalable
  1242. // vector
  1243. if (isa<ScalableVectorType>(Args[1]->getType()))
  1244. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1245. unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
  1246. return thisT()->getShuffleCost(
  1247. TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
  1248. Index, cast<VectorType>(Args[1]->getType()));
  1249. }
  1250. case Intrinsic::experimental_vector_reverse: {
  1251. return thisT()->getShuffleCost(TTI::SK_Reverse,
  1252. cast<VectorType>(Args[0]->getType()), None,
  1253. 0, cast<VectorType>(RetTy));
  1254. }
  1255. case Intrinsic::experimental_vector_splice: {
  1256. unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
  1257. return thisT()->getShuffleCost(TTI::SK_Splice,
  1258. cast<VectorType>(Args[0]->getType()), None,
  1259. Index, cast<VectorType>(RetTy));
  1260. }
  1261. case Intrinsic::vector_reduce_add:
  1262. case Intrinsic::vector_reduce_mul:
  1263. case Intrinsic::vector_reduce_and:
  1264. case Intrinsic::vector_reduce_or:
  1265. case Intrinsic::vector_reduce_xor:
  1266. case Intrinsic::vector_reduce_smax:
  1267. case Intrinsic::vector_reduce_smin:
  1268. case Intrinsic::vector_reduce_fmax:
  1269. case Intrinsic::vector_reduce_fmin:
  1270. case Intrinsic::vector_reduce_umax:
  1271. case Intrinsic::vector_reduce_umin: {
  1272. IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
  1273. return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  1274. }
  1275. case Intrinsic::vector_reduce_fadd:
  1276. case Intrinsic::vector_reduce_fmul: {
  1277. IntrinsicCostAttributes Attrs(
  1278. IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
  1279. return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  1280. }
  1281. case Intrinsic::fshl:
  1282. case Intrinsic::fshr: {
  1283. if (isa<ScalableVectorType>(RetTy))
  1284. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1285. const Value *X = Args[0];
  1286. const Value *Y = Args[1];
  1287. const Value *Z = Args[2];
  1288. TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
  1289. TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
  1290. TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
  1291. TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
  1292. TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
  1293. OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
  1294. : TTI::OP_None;
  1295. // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  1296. // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
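// E.g. (illustrative) a funnel shift with a non-constant shift amount is
// costed below as or + sub + shl + lshr plus a urem on the shift amount,
// and for non-rotates (X != Y) an additional icmp + select for
// shift-by-zero handling.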
  1297. InstructionCost Cost = 0;
  1298. Cost +=
  1299. thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
  1300. Cost +=
  1301. thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
  1302. Cost += thisT()->getArithmeticInstrCost(
  1303. BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
  1304. Cost += thisT()->getArithmeticInstrCost(
  1305. BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
// Non-constant shift amounts require a modulo.
  1307. if (OpKindZ != TTI::OK_UniformConstantValue &&
  1308. OpKindZ != TTI::OK_NonUniformConstantValue)
  1309. Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
  1310. CostKind, OpKindZ, OpKindBW,
  1311. OpPropsZ, OpPropsBW);
  1312. // For non-rotates (X != Y) we must add shift-by-zero handling costs.
  1313. if (X != Y) {
  1314. Type *CondTy = RetTy->getWithNewBitWidth(1);
  1315. Cost +=
  1316. thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
  1317. CmpInst::ICMP_EQ, CostKind);
  1318. Cost +=
  1319. thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
  1320. CmpInst::ICMP_EQ, CostKind);
  1321. }
  1322. return Cost;
  1323. }
  1324. }
  1325. // Assume that we need to scalarize this intrinsic.
  1326. // Compute the scalarization overhead based on Args for a vector
  1327. // intrinsic.
  1328. InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  1329. if (RetVF.isVector() && !RetVF.isScalable()) {
  1330. ScalarizationCost = 0;
  1331. if (!RetTy->isVoidTy())
  1332. ScalarizationCost +=
  1333. getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
  1334. ScalarizationCost +=
  1335. getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
  1336. }
  1337. IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
  1338. ScalarizationCost);
  1339. return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  1340. }
  1341. /// Get intrinsic cost based on argument types.
  1342. /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  1343. /// cost of scalarizing the arguments and the return value will be computed
  1344. /// based on types.
  1345. InstructionCost
  1346. getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  1347. TTI::TargetCostKind CostKind) {
  1348. Intrinsic::ID IID = ICA.getID();
  1349. Type *RetTy = ICA.getReturnType();
  1350. const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
  1351. FastMathFlags FMF = ICA.getFlags();
  1352. InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
  1353. bool SkipScalarizationCost = ICA.skipScalarizationCost();
  1354. VectorType *VecOpTy = nullptr;
  1355. if (!Tys.empty()) {
  1356. // The vector reduction operand is operand 0 except for fadd/fmul.
  1357. // Their operand 0 is a scalar start value, so the vector op is operand 1.
  1358. unsigned VecTyIndex = 0;
  1359. if (IID == Intrinsic::vector_reduce_fadd ||
  1360. IID == Intrinsic::vector_reduce_fmul)
  1361. VecTyIndex = 1;
  1362. assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
  1363. VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
  1364. }
// Library call cost - other than for code size, make it expensive.
  1366. unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
  1367. SmallVector<unsigned, 2> ISDs;
  1368. switch (IID) {
  1369. default: {
  1370. // Scalable vectors cannot be scalarized, so return Invalid.
  1371. if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
  1372. return isa<ScalableVectorType>(Ty);
  1373. }))
  1374. return InstructionCost::getInvalid();
  1375. // Assume that we need to scalarize this intrinsic.
  1376. InstructionCost ScalarizationCost =
  1377. SkipScalarizationCost ? ScalarizationCostPassed : 0;
  1378. unsigned ScalarCalls = 1;
  1379. Type *ScalarRetTy = RetTy;
  1380. if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
  1381. if (!SkipScalarizationCost)
  1382. ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
  1383. ScalarCalls = std::max(ScalarCalls,
  1384. cast<FixedVectorType>(RetVTy)->getNumElements());
  1385. ScalarRetTy = RetTy->getScalarType();
  1386. }
  1387. SmallVector<Type *, 4> ScalarTys;
  1388. for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
  1389. Type *Ty = Tys[i];
  1390. if (auto *VTy = dyn_cast<VectorType>(Ty)) {
  1391. if (!SkipScalarizationCost)
  1392. ScalarizationCost += getScalarizationOverhead(VTy, false, true);
  1393. ScalarCalls = std::max(ScalarCalls,
  1394. cast<FixedVectorType>(VTy)->getNumElements());
  1395. Ty = Ty->getScalarType();
  1396. }
  1397. ScalarTys.push_back(Ty);
  1398. }
  1399. if (ScalarCalls == 1)
  1400. return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
  1401. IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
  1402. InstructionCost ScalarCost =
  1403. thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
  1404. return ScalarCalls * ScalarCost + ScalarizationCost;
  1405. }
  1406. // Look for intrinsics that can be lowered directly or turned into a scalar
  1407. // intrinsic call.
  1408. case Intrinsic::sqrt:
  1409. ISDs.push_back(ISD::FSQRT);
  1410. break;
  1411. case Intrinsic::sin:
  1412. ISDs.push_back(ISD::FSIN);
  1413. break;
  1414. case Intrinsic::cos:
  1415. ISDs.push_back(ISD::FCOS);
  1416. break;
  1417. case Intrinsic::exp:
  1418. ISDs.push_back(ISD::FEXP);
  1419. break;
  1420. case Intrinsic::exp2:
  1421. ISDs.push_back(ISD::FEXP2);
  1422. break;
  1423. case Intrinsic::log:
  1424. ISDs.push_back(ISD::FLOG);
  1425. break;
  1426. case Intrinsic::log10:
  1427. ISDs.push_back(ISD::FLOG10);
  1428. break;
  1429. case Intrinsic::log2:
  1430. ISDs.push_back(ISD::FLOG2);
  1431. break;
  1432. case Intrinsic::fabs:
  1433. ISDs.push_back(ISD::FABS);
  1434. break;
  1435. case Intrinsic::canonicalize:
  1436. ISDs.push_back(ISD::FCANONICALIZE);
  1437. break;
  1438. case Intrinsic::minnum:
  1439. ISDs.push_back(ISD::FMINNUM);
  1440. break;
  1441. case Intrinsic::maxnum:
  1442. ISDs.push_back(ISD::FMAXNUM);
  1443. break;
  1444. case Intrinsic::minimum:
  1445. ISDs.push_back(ISD::FMINIMUM);
  1446. break;
  1447. case Intrinsic::maximum:
  1448. ISDs.push_back(ISD::FMAXIMUM);
  1449. break;
  1450. case Intrinsic::copysign:
  1451. ISDs.push_back(ISD::FCOPYSIGN);
  1452. break;
  1453. case Intrinsic::floor:
  1454. ISDs.push_back(ISD::FFLOOR);
  1455. break;
  1456. case Intrinsic::ceil:
  1457. ISDs.push_back(ISD::FCEIL);
  1458. break;
  1459. case Intrinsic::trunc:
  1460. ISDs.push_back(ISD::FTRUNC);
  1461. break;
  1462. case Intrinsic::nearbyint:
  1463. ISDs.push_back(ISD::FNEARBYINT);
  1464. break;
  1465. case Intrinsic::rint:
  1466. ISDs.push_back(ISD::FRINT);
  1467. break;
  1468. case Intrinsic::round:
  1469. ISDs.push_back(ISD::FROUND);
  1470. break;
  1471. case Intrinsic::roundeven:
  1472. ISDs.push_back(ISD::FROUNDEVEN);
  1473. break;
  1474. case Intrinsic::pow:
  1475. ISDs.push_back(ISD::FPOW);
  1476. break;
  1477. case Intrinsic::fma:
  1478. ISDs.push_back(ISD::FMA);
  1479. break;
  1480. case Intrinsic::fmuladd:
  1481. ISDs.push_back(ISD::FMA);
  1482. break;
  1483. case Intrinsic::experimental_constrained_fmuladd:
  1484. ISDs.push_back(ISD::STRICT_FMA);
  1485. break;
  1486. // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
  1487. case Intrinsic::lifetime_start:
  1488. case Intrinsic::lifetime_end:
  1489. case Intrinsic::sideeffect:
  1490. case Intrinsic::pseudoprobe:
  1491. case Intrinsic::arithmetic_fence:
  1492. return 0;
  1493. case Intrinsic::masked_store: {
  1494. Type *Ty = Tys[0];
  1495. Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
  1496. return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
  1497. CostKind);
  1498. }
  1499. case Intrinsic::masked_load: {
  1500. Type *Ty = RetTy;
  1501. Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
  1502. return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
  1503. CostKind);
  1504. }
  1505. case Intrinsic::vector_reduce_add:
  1506. return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
  1507. None, CostKind);
  1508. case Intrinsic::vector_reduce_mul:
  1509. return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
  1510. None, CostKind);
  1511. case Intrinsic::vector_reduce_and:
  1512. return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
  1513. None, CostKind);
  1514. case Intrinsic::vector_reduce_or:
  1515. return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
  1516. CostKind);
  1517. case Intrinsic::vector_reduce_xor:
  1518. return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
  1519. None, CostKind);
  1520. case Intrinsic::vector_reduce_fadd:
  1521. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
  1522. FMF, CostKind);
  1523. case Intrinsic::vector_reduce_fmul:
  1524. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
  1525. FMF, CostKind);
  1526. case Intrinsic::vector_reduce_smax:
  1527. case Intrinsic::vector_reduce_smin:
  1528. case Intrinsic::vector_reduce_fmax:
  1529. case Intrinsic::vector_reduce_fmin:
  1530. return thisT()->getMinMaxReductionCost(
  1531. VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
  1532. /*IsUnsigned=*/false, CostKind);
  1533. case Intrinsic::vector_reduce_umax:
  1534. case Intrinsic::vector_reduce_umin:
  1535. return thisT()->getMinMaxReductionCost(
  1536. VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
  1537. /*IsUnsigned=*/true, CostKind);
  1538. case Intrinsic::abs: {
  1539. // abs(X) = select(icmp(X,0),X,sub(0,X))
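// E.g. (illustrative) for i32:
//   %c = icmp sgt i32 %x, 0
//   %n = sub i32 0, %x
//   %r = select i1 %c, i32 %x, i32 %n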
  1540. Type *CondTy = RetTy->getWithNewBitWidth(1);
  1541. CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
  1542. InstructionCost Cost = 0;
  1543. Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
  1544. Pred, CostKind);
  1545. Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
  1546. Pred, CostKind);
  1547. // TODO: Should we add an OperandValueProperties::OP_Zero property?
  1548. Cost += thisT()->getArithmeticInstrCost(
  1549. BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
  1550. return Cost;
  1551. }
  1552. case Intrinsic::smax:
  1553. case Intrinsic::smin:
  1554. case Intrinsic::umax:
  1555. case Intrinsic::umin: {
  1556. // minmax(X,Y) = select(icmp(X,Y),X,Y)
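// E.g. (illustrative) for smax on i32:
//   %c = icmp sgt i32 %x, %y
//   %r = select i1 %c, i32 %x, i32 %y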
  1557. Type *CondTy = RetTy->getWithNewBitWidth(1);
  1558. bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
  1559. CmpInst::Predicate Pred =
  1560. IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
  1561. InstructionCost Cost = 0;
  1562. Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
  1563. Pred, CostKind);
  1564. Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
  1565. Pred, CostKind);
  1566. return Cost;
  1567. }
  1568. case Intrinsic::sadd_sat:
  1569. case Intrinsic::ssub_sat: {
  1570. Type *CondTy = RetTy->getWithNewBitWidth(1);
  1571. Type *OpTy = StructType::create({RetTy, CondTy});
  1572. Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
  1573. ? Intrinsic::sadd_with_overflow
  1574. : Intrinsic::ssub_with_overflow;
  1575. CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
  1576. // SatMax -> Overflow && SumDiff < 0
  1577. // SatMin -> Overflow && SumDiff >= 0
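// Costed below (sketch) as: the matching {sadd,ssub}.with.overflow call,
// an icmp on the sign of the sum/diff, and two selects (pick SatMin vs
// SatMax, then pick the saturated value vs the plain sum/diff).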
  1578. InstructionCost Cost = 0;
  1579. IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
  1580. nullptr, ScalarizationCostPassed);
  1581. Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
  1582. Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
  1583. Pred, CostKind);
  1584. Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
  1585. CondTy, Pred, CostKind);
  1586. return Cost;
  1587. }
  1588. case Intrinsic::uadd_sat:
  1589. case Intrinsic::usub_sat: {
  1590. Type *CondTy = RetTy->getWithNewBitWidth(1);
  1591. Type *OpTy = StructType::create({RetTy, CondTy});
  1592. Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
  1593. ? Intrinsic::uadd_with_overflow
  1594. : Intrinsic::usub_with_overflow;
  1595. InstructionCost Cost = 0;
  1596. IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
  1597. nullptr, ScalarizationCostPassed);
  1598. Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
  1599. Cost +=
  1600. thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
  1601. CmpInst::BAD_ICMP_PREDICATE, CostKind);
  1602. return Cost;
  1603. }
  1604. case Intrinsic::smul_fix:
  1605. case Intrinsic::umul_fix: {
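// Expansion being costed (sketch): widen both operands to 2*BW, multiply,
// truncate the low and high halves back to BW, and recombine them with a
// funnel-shift-like (lshr | shl) by the fixed-point scale.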
  1606. unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
  1607. Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
  1608. unsigned ExtOp =
  1609. IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
  1610. TTI::CastContextHint CCH = TTI::CastContextHint::None;
  1611. InstructionCost Cost = 0;
  1612. Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
  1613. Cost +=
  1614. thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
  1615. Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
  1616. CCH, CostKind);
  1617. Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
  1618. CostKind, TTI::OK_AnyValue,
  1619. TTI::OK_UniformConstantValue);
  1620. Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
  1621. TTI::OK_AnyValue,
  1622. TTI::OK_UniformConstantValue);
  1623. Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
  1624. return Cost;
  1625. }
  1626. case Intrinsic::sadd_with_overflow:
  1627. case Intrinsic::ssub_with_overflow: {
  1628. Type *SumTy = RetTy->getContainedType(0);
  1629. Type *OverflowTy = RetTy->getContainedType(1);
  1630. unsigned Opcode = IID == Intrinsic::sadd_with_overflow
  1631. ? BinaryOperator::Add
  1632. : BinaryOperator::Sub;
  1633. // Add:
  1634. // Overflow -> (Result < LHS) ^ (RHS < 0)
  1635. // Sub:
  1636. // Overflow -> (Result < LHS) ^ (RHS > 0)
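// E.g. (illustrative) for sadd.with.overflow on i32:
//   %res = add i32 %lhs, %rhs
//   %c0  = icmp slt i32 %res, %lhs
//   %c1  = icmp slt i32 %rhs, 0
//   %ov  = xor i1 %c0, %c1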
  1637. InstructionCost Cost = 0;
  1638. Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
  1639. Cost += 2 * thisT()->getCmpSelInstrCost(
  1640. Instruction::ICmp, SumTy, OverflowTy,
  1641. CmpInst::ICMP_SGT, CostKind);
  1642. Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
  1643. CostKind);
  1644. return Cost;
  1645. }
  1646. case Intrinsic::uadd_with_overflow:
  1647. case Intrinsic::usub_with_overflow: {
  1648. Type *SumTy = RetTy->getContainedType(0);
  1649. Type *OverflowTy = RetTy->getContainedType(1);
  1650. unsigned Opcode = IID == Intrinsic::uadd_with_overflow
  1651. ? BinaryOperator::Add
  1652. : BinaryOperator::Sub;
  1653. CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
  1654. ? CmpInst::ICMP_ULT
  1655. : CmpInst::ICMP_UGT;
  1656. InstructionCost Cost = 0;
  1657. Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
  1658. Cost +=
  1659. thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
  1660. Pred, CostKind);
  1661. return Cost;
  1662. }
  1663. case Intrinsic::smul_with_overflow:
  1664. case Intrinsic::umul_with_overflow: {
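// Expansion being costed (sketch): widen both operands to 2*BW, multiply,
// truncate the low and high halves of the product back to BW; overflow is
// detected by comparing the high half (lshr by BW) against zero for the
// unsigned case, or against the sign bits of the low half (ashr) for the
// signed case.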
  1665. Type *MulTy = RetTy->getContainedType(0);
  1666. Type *OverflowTy = RetTy->getContainedType(1);
  1667. unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
  1668. Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
  1669. bool IsSigned = IID == Intrinsic::smul_with_overflow;
  1670. unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
  1671. TTI::CastContextHint CCH = TTI::CastContextHint::None;
  1672. InstructionCost Cost = 0;
  1673. Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
  1674. Cost +=
  1675. thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
  1676. Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
  1677. CCH, CostKind);
  1678. Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
  1679. CostKind, TTI::OK_AnyValue,
  1680. TTI::OK_UniformConstantValue);
  1681. if (IsSigned)
  1682. Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
  1683. CostKind, TTI::OK_AnyValue,
  1684. TTI::OK_UniformConstantValue);
  1685. Cost += thisT()->getCmpSelInstrCost(
  1686. BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
  1687. return Cost;
  1688. }
  1689. case Intrinsic::ctpop:
  1690. ISDs.push_back(ISD::CTPOP);
  1691. // In case of legalization use TCC_Expensive. This is cheaper than a
  1692. // library call but still not a cheap instruction.
  1693. SingleCallCost = TargetTransformInfo::TCC_Expensive;
  1694. break;
  1695. case Intrinsic::ctlz:
  1696. ISDs.push_back(ISD::CTLZ);
  1697. break;
  1698. case Intrinsic::cttz:
  1699. ISDs.push_back(ISD::CTTZ);
  1700. break;
  1701. case Intrinsic::bswap:
  1702. ISDs.push_back(ISD::BSWAP);
  1703. break;
  1704. case Intrinsic::bitreverse:
  1705. ISDs.push_back(ISD::BITREVERSE);
  1706. break;
  1707. }
  1708. const TargetLoweringBase *TLI = getTLI();
  1709. std::pair<InstructionCost, MVT> LT =
  1710. TLI->getTypeLegalizationCost(DL, RetTy);
  1711. SmallVector<InstructionCost, 2> LegalCost;
  1712. SmallVector<InstructionCost, 2> CustomCost;
  1713. for (unsigned ISD : ISDs) {
  1714. if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
  1715. if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
  1716. TLI->isFAbsFree(LT.second)) {
  1717. return 0;
  1718. }
  1719. // The operation is legal. Assume it costs 1.
  1720. // If the type is split to multiple registers, assume that there is some
  1721. // overhead to this.
  1722. // TODO: Once we have extract/insert subvector cost we need to use them.
  1723. if (LT.first > 1)
  1724. LegalCost.push_back(LT.first * 2);
  1725. else
  1726. LegalCost.push_back(LT.first * 1);
  1727. } else if (!TLI->isOperationExpand(ISD, LT.second)) {
  1728. // If the operation is custom lowered then assume
  1729. // that the code is twice as expensive.
  1730. CustomCost.push_back(LT.first * 2);
  1731. }
  1732. }
  1733. auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
  1734. if (MinLegalCostI != LegalCost.end())
  1735. return *MinLegalCostI;
  1736. auto MinCustomCostI =
  1737. std::min_element(CustomCost.begin(), CustomCost.end());
  1738. if (MinCustomCostI != CustomCost.end())
  1739. return *MinCustomCostI;
// If we can't lower fmuladd into an FMA, estimate the cost as a
// floating-point mul followed by an add.
  1742. if (IID == Intrinsic::fmuladd)
  1743. return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
  1744. CostKind) +
  1745. thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
  1746. CostKind);
  1747. if (IID == Intrinsic::experimental_constrained_fmuladd) {
  1748. IntrinsicCostAttributes FMulAttrs(
  1749. Intrinsic::experimental_constrained_fmul, RetTy, Tys);
  1750. IntrinsicCostAttributes FAddAttrs(
  1751. Intrinsic::experimental_constrained_fadd, RetTy, Tys);
  1752. return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
  1753. thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
  1754. }
  1755. // Else, assume that we need to scalarize this intrinsic. For math builtins
  1756. // this will emit a costly libcall, adding call overhead and spills. Make it
  1757. // very expensive.
  1758. if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
  1759. // Scalable vectors cannot be scalarized, so return Invalid.
  1760. if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
  1761. return isa<ScalableVectorType>(Ty);
  1762. }))
  1763. return InstructionCost::getInvalid();
  1764. InstructionCost ScalarizationCost =
  1765. SkipScalarizationCost ? ScalarizationCostPassed
  1766. : getScalarizationOverhead(RetVTy, true, false);
  1767. unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
  1768. SmallVector<Type *, 4> ScalarTys;
  1769. for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
  1770. Type *Ty = Tys[i];
  1771. if (Ty->isVectorTy())
  1772. Ty = Ty->getScalarType();
  1773. ScalarTys.push_back(Ty);
  1774. }
  1775. IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
  1776. InstructionCost ScalarCost =
  1777. thisT()->getIntrinsicInstrCost(Attrs, CostKind);
  1778. for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
  1779. if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
  1780. if (!ICA.skipScalarizationCost())
  1781. ScalarizationCost += getScalarizationOverhead(VTy, false, true);
  1782. ScalarCalls = std::max(ScalarCalls,
  1783. cast<FixedVectorType>(VTy)->getNumElements());
  1784. }
  1785. }
  1786. return ScalarCalls * ScalarCost + ScalarizationCost;
  1787. }
  1788. // This is going to be turned into a library call, make it expensive.
  1789. return SingleCallCost;
  1790. }
  1791. /// Compute a cost of the given call instruction.
  1792. ///
  1793. /// Compute the cost of calling function F with return type RetTy and
  1794. /// argument types Tys. F might be nullptr, in this case the cost of an
  1795. /// arbitrary call with the specified signature will be returned.
  1796. /// This is used, for instance, when we estimate call of a vector
  1797. /// counterpart of the given function.
  1798. /// \param F Called function, might be nullptr.
  1799. /// \param RetTy Return value types.
  1800. /// \param Tys Argument types.
  1801. /// \returns The cost of Call instruction.
  1802. InstructionCost getCallInstrCost(Function *F, Type *RetTy,
  1803. ArrayRef<Type *> Tys,
  1804. TTI::TargetCostKind CostKind) {
  1805. return 10;
  1806. }
  1807. unsigned getNumberOfParts(Type *Tp) {
  1808. std::pair<InstructionCost, MVT> LT =
  1809. getTLI()->getTypeLegalizationCost(DL, Tp);
  1810. return LT.first.isValid() ? *LT.first.getValue() : 0;
  1811. }
  1812. InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
  1813. const SCEV *) {
  1814. return 0;
  1815. }
/// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
/// We assume that the reduction is performed in the following way:
///
/// %val1 = shufflevector <n x t> %val, <n x t> %undef,
///         <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
///            \----------------v-------------/  \----------v------------/
///                        n/2 elements               n/2 elements
/// %red1 = op <n x t> %val, <n x t> %val1
/// After this operation we have a vector %red1 where only the first n/2
/// elements are meaningful; the second n/2 elements are undefined and can be
/// dropped. All further operations actually work with a vector of
/// length n/2, not n, though the real vector length is still n.
/// %val2 = shufflevector <n x t> %red1, <n x t> %undef,
///         <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
///            \----------------v-------------/  \----------v------------/
///                        n/4 elements              3*n/4 elements
/// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
/// length n/2; the resulting vector has length n/4, etc.
///
/// The cost model should take into account that the actual length of the
/// vector is reduced on each iteration.
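/// For example (illustrative), an add reduction of a <8 x i32> vector that
/// is legal on the target takes Log2(8) = 3 shuffle + add levels, followed
/// by a single extractelement of lane 0.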
  1837. InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
  1838. TTI::TargetCostKind CostKind) {
  1839. Type *ScalarTy = Ty->getElementType();
  1840. unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
  1841. if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
  1842. ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
  1843. NumVecElts >= 2) {
  1844. // Or reduction for i1 is represented as:
  1845. // %val = bitcast <ReduxWidth x i1> to iReduxWidth
  1846. // %res = cmp ne iReduxWidth %val, 0
  1847. // And reduction for i1 is represented as:
  1848. // %val = bitcast <ReduxWidth x i1> to iReduxWidth
// %res = cmp eq iReduxWidth %val, -1 (all ones)
  1850. Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
  1851. return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
  1852. TTI::CastContextHint::None, CostKind) +
  1853. thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
  1854. CmpInst::makeCmpResultType(ValTy),
  1855. CmpInst::BAD_ICMP_PREDICATE, CostKind);
  1856. }
  1857. unsigned NumReduxLevels = Log2_32(NumVecElts);
  1858. InstructionCost ArithCost = 0;
  1859. InstructionCost ShuffleCost = 0;
  1860. std::pair<InstructionCost, MVT> LT =
  1861. thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
  1862. unsigned LongVectorCount = 0;
  1863. unsigned MVTLen =
  1864. LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
  1865. while (NumVecElts > MVTLen) {
  1866. NumVecElts /= 2;
  1867. VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
  1868. ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
  1869. NumVecElts, SubTy);
  1870. ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
  1871. Ty = SubTy;
  1872. ++LongVectorCount;
  1873. }
  1874. NumReduxLevels -= LongVectorCount;
  1875. // The minimal length of the vector is limited by the real length of vector
  1876. // operations performed on the current platform. That's why several final
  1877. // reduction operations are performed on the vectors with the same
  1878. // architecture-dependent length.
  1879. // By default reductions need one shuffle per reduction level.
  1880. ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
  1881. TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
  1882. ArithCost +=
  1883. NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
  1884. return ShuffleCost + ArithCost +
  1885. thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  1886. }
  1887. /// Try to calculate the cost of performing strict (in-order) reductions,
  1888. /// which involves doing a sequence of floating point additions in lane
  1889. /// order, starting with an initial value. For example, consider a scalar
  1890. /// initial value 'InitVal' of type float and a vector of type <4 x float>:
  1891. ///
  1892. /// Vector = <float %v0, float %v1, float %v2, float %v3>
  1893. ///
  1894. /// %add1 = %InitVal + %v0
  1895. /// %add2 = %add1 + %v1
  1896. /// %add3 = %add2 + %v2
  1897. /// %add4 = %add3 + %v3
  1898. ///
  1899. /// As a simple estimate we can say the cost of such a reduction is 4 times
  1900. /// the cost of a scalar FP addition. We can only estimate the costs for
  1901. /// fixed-width vectors here because for scalable vectors we do not know the
  1902. /// runtime number of operations.
  1903. InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
  1904. TTI::TargetCostKind CostKind) {
  1905. // Targets must implement a default value for the scalable case, since
  1906. // we don't know how many lanes the vector has.
  1907. if (isa<ScalableVectorType>(Ty))
  1908. return InstructionCost::getInvalid();
  1909. auto *VTy = cast<FixedVectorType>(Ty);
  1910. InstructionCost ExtractCost =
  1911. getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
  1912. InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
  1913. Opcode, VTy->getElementType(), CostKind);
  1914. ArithCost *= VTy->getNumElements();
  1915. return ExtractCost + ArithCost;
  1916. }
  1917. InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
  1918. Optional<FastMathFlags> FMF,
  1919. TTI::TargetCostKind CostKind) {
  1920. if (TTI::requiresOrderedReduction(FMF))
  1921. return getOrderedReductionCost(Opcode, Ty, CostKind);
  1922. return getTreeReductionCost(Opcode, Ty, CostKind);
  1923. }
  1924. /// Try to calculate op costs for min/max reduction operations.
  1925. /// \param CondTy Conditional type for the Select instruction.
  1926. InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
  1927. bool IsUnsigned,
  1928. TTI::TargetCostKind CostKind) {
  1929. Type *ScalarTy = Ty->getElementType();
  1930. Type *ScalarCondTy = CondTy->getElementType();
  1931. unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
  1932. unsigned NumReduxLevels = Log2_32(NumVecElts);
  1933. unsigned CmpOpcode;
  1934. if (Ty->isFPOrFPVectorTy()) {
  1935. CmpOpcode = Instruction::FCmp;
  1936. } else {
  1937. assert(Ty->isIntOrIntVectorTy() &&
  1938. "expecting floating point or integer type for min/max reduction");
  1939. CmpOpcode = Instruction::ICmp;
  1940. }
  1941. InstructionCost MinMaxCost = 0;
  1942. InstructionCost ShuffleCost = 0;
  1943. std::pair<InstructionCost, MVT> LT =
  1944. thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
  1945. unsigned LongVectorCount = 0;
  1946. unsigned MVTLen =
  1947. LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
  1948. while (NumVecElts > MVTLen) {
  1949. NumVecElts /= 2;
  1950. auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
  1951. CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
  1952. ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
  1953. NumVecElts, SubTy);
  1954. MinMaxCost +=
  1955. thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
  1956. CmpInst::BAD_ICMP_PREDICATE, CostKind) +
  1957. thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
  1958. CmpInst::BAD_ICMP_PREDICATE, CostKind);
  1959. Ty = SubTy;
  1960. ++LongVectorCount;
  1961. }
  1962. NumReduxLevels -= LongVectorCount;
// The minimal length of the vector is limited by the real length of vector
// operations performed on the current platform. That's why several final
// reduction operations are performed on vectors with the same
// architecture-dependent length.
  1967. ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
  1968. TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
  1969. MinMaxCost +=
  1970. NumReduxLevels *
  1971. (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
  1972. CmpInst::BAD_ICMP_PREDICATE, CostKind) +
  1973. thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
  1974. CmpInst::BAD_ICMP_PREDICATE, CostKind));
  1975. // The last min/max should be in vector registers and we counted it above.
  1976. // So just need a single extractelement.
  1977. return ShuffleCost + MinMaxCost +
  1978. thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  1979. }
  1980. InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
  1981. Type *ResTy, VectorType *Ty,
  1982. TTI::TargetCostKind CostKind) {
// Without any native support, this is equivalent to the cost of
// vecreduce.add(ext), or vecreduce.add(mul(ext, ext)) if IsMLA.
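// E.g. (illustrative) for IsMLA with <8 x i16> operands and an i32 result:
//   vecreduce.add(mul(sext <8 x i16> %a to <8 x i32>,
//                     sext <8 x i16> %b to <8 x i32>))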
  1985. VectorType *ExtTy = VectorType::get(ResTy, Ty);
  1986. InstructionCost RedCost = thisT()->getArithmeticReductionCost(
  1987. Instruction::Add, ExtTy, None, CostKind);
  1988. InstructionCost MulCost = 0;
  1989. InstructionCost ExtCost = thisT()->getCastInstrCost(
  1990. IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
  1991. TTI::CastContextHint::None, CostKind);
  1992. if (IsMLA) {
  1993. MulCost =
  1994. thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
  1995. ExtCost *= 2;
  1996. }
  1997. return RedCost + MulCost + ExtCost;
  1998. }
  1999. InstructionCost getVectorSplitCost() { return 1; }
  2000. /// @}
  2001. };
  2002. /// Concrete BasicTTIImpl that can be used if no further customization
  2003. /// is needed.
  2004. class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  2005. using BaseT = BasicTTIImplBase<BasicTTIImpl>;
  2006. friend class BasicTTIImplBase<BasicTTIImpl>;
  2007. const TargetSubtargetInfo *ST;
  2008. const TargetLoweringBase *TLI;
  2009. const TargetSubtargetInfo *getST() const { return ST; }
  2010. const TargetLoweringBase *getTLI() const { return TLI; }
  2011. public:
  2012. explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
  2013. };
  2014. } // end namespace llvm
  2015. #endif // LLVM_CODEGEN_BASICTTIIMPL_H
  2016. #ifdef __GNUC__
  2017. #pragma GCC diagnostic pop
  2018. #endif