RISCVTargetTransformInfo.cpp

  1. //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "RISCVTargetTransformInfo.h"
  9. #include "MCTargetDesc/RISCVMatInt.h"
  10. #include "llvm/Analysis/TargetTransformInfo.h"
  11. #include "llvm/CodeGen/BasicTTIImpl.h"
  12. #include "llvm/CodeGen/CostTable.h"
  13. #include "llvm/CodeGen/TargetLowering.h"
  14. #include <cmath>
  15. #include <optional>
  16. using namespace llvm;
  17. #define DEBUG_TYPE "riscvtti"
  18. static cl::opt<unsigned> RVVRegisterWidthLMUL(
  19. "riscv-v-register-bit-width-lmul",
  20. cl::desc(
  21. "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
  22. "by autovectorized code. Fractional LMULs are not supported."),
  23. cl::init(1), cl::Hidden);
  24. static cl::opt<unsigned> SLPMaxVF(
  25. "riscv-v-slp-max-vf",
  26. cl::desc(
  27. "Result used for getMaximumVF query which is used exclusively by "
  28. "SLP vectorizer. Defaults to 1 which disables SLP."),
  29. cl::init(1), cl::Hidden);
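// Usage note (editorial, not from the source): both options above are LLVM
// cl::opt flags, so they are typically passed as internal options, e.g.
// "-mllvm -riscv-v-slp-max-vf=4" from clang, or directly to opt/llc.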
  30. InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
  31. // TODO: We assume a reciprocal throughput of 1 for LMUL_1 here; the real
  32. // value is implementation-defined.
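// Rough illustration (assumed, not from the source, with getRealMinVLen() ==
// 128): a 256-bit fixed vector such as MVT::v8i32 costs 2, and a scalable
// type that decodes to a non-fractional LMUL of 2 also costs 2.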
  33. if (!VT.isVector())
  34. return InstructionCost::getInvalid();
  35. unsigned Cost;
  36. if (VT.isScalableVector()) {
  37. unsigned LMul;
  38. bool Fractional;
  39. std::tie(LMul, Fractional) =
  40. RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
  41. if (Fractional)
  42. Cost = 1;
  43. else
  44. Cost = LMul;
  45. } else {
  46. Cost = VT.getSizeInBits() / ST->getRealMinVLen();
  47. }
  48. return std::max<unsigned>(Cost, 1);
  49. }
  50. InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
  51. TTI::TargetCostKind CostKind) {
  52. assert(Ty->isIntegerTy() &&
  53. "getIntImmCost can only estimate cost of materialising integers");
  54. // We have a Zero register, so 0 is always free.
  55. if (Imm == 0)
  56. return TTI::TCC_Free;
  57. // Otherwise, we check how many instructions it will take to materialise.
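// Illustrative example (not from the source): on RV64, 0x12345 can be built
// with lui+addi (cost 2), while a full 64-bit constant generally needs a
// longer lui/addi/slli sequence; getIntMatCost returns that instruction count.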
  58. const DataLayout &DL = getDataLayout();
  59. return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
  60. getST()->getFeatureBits());
  61. }
  62. // Look for patterns of shift followed by AND that can be turned into a pair of
  63. // shifts. We won't need to materialize an immediate for the AND so these can
  64. // be considered free.
  65. static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  66. uint64_t Mask = Imm.getZExtValue();
  67. auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  68. if (!BO || !BO->hasOneUse())
  69. return false;
  70. if (BO->getOpcode() != Instruction::Shl)
  71. return false;
  72. if (!isa<ConstantInt>(BO->getOperand(1)))
  73. return false;
  74. unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  75. // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  76. // is a mask shifted by c2 bits with c3 leading zeros.
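// Illustrative example (assumed, not from the source): for
//   (and (shl x, 4), 0xff0)
// the mask 0xff0 is 0xff shifted left by 4, so ShAmt == Trailing == 4 and the
// pair (srli (slli x, 56), 52) can be used on RV64 without materializing 0xff0.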
  77. if (isShiftedMask_64(Mask)) {
  78. unsigned Trailing = countTrailingZeros(Mask);
  79. if (ShAmt == Trailing)
  80. return true;
  81. }
  82. return false;
  83. }
  84. InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  85. const APInt &Imm, Type *Ty,
  86. TTI::TargetCostKind CostKind,
  87. Instruction *Inst) {
  88. assert(Ty->isIntegerTy() &&
  89. "getIntImmCost can only estimate cost of materialising integers");
  90. // We have a Zero register, so 0 is always free.
  91. if (Imm == 0)
  92. return TTI::TCC_Free;
  93. // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  94. // commutative; for others, the immediate must come from a specific argument index.
  95. bool Takes12BitImm = false;
  96. unsigned ImmArgIdx = ~0U;
  97. switch (Opcode) {
  98. case Instruction::GetElementPtr:
  99. // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
  100. // split up large offsets in GEP into better parts than ConstantHoisting
  101. // can.
  102. return TTI::TCC_Free;
  103. case Instruction::And:
  104. // zext.h
  105. if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
  106. return TTI::TCC_Free;
  107. // zext.w
  108. if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
  109. return TTI::TCC_Free;
  110. // bclri
  111. if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
  112. return TTI::TCC_Free;
  113. if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
  114. canUseShiftPair(Inst, Imm))
  115. return TTI::TCC_Free;
  116. Takes12BitImm = true;
  117. break;
  118. case Instruction::Add:
  119. Takes12BitImm = true;
  120. break;
  121. case Instruction::Or:
  122. case Instruction::Xor:
  123. // bseti/binvi
  124. if (ST->hasStdExtZbs() && Imm.isPowerOf2())
  125. return TTI::TCC_Free;
  126. Takes12BitImm = true;
  127. break;
  128. case Instruction::Mul:
  129. // Negated power of 2 is a shift and a negate.
  130. if (Imm.isNegatedPowerOf2())
  131. return TTI::TCC_Free;
  132. // FIXME: There is no MULI instruction.
  133. Takes12BitImm = true;
  134. break;
  135. case Instruction::Sub:
  136. case Instruction::Shl:
  137. case Instruction::LShr:
  138. case Instruction::AShr:
  139. Takes12BitImm = true;
  140. ImmArgIdx = 1;
  141. break;
  142. default:
  143. break;
  144. }
  145. if (Takes12BitImm) {
  146. // Check immediate is the correct argument...
  147. if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
  148. // ... and fits into the 12-bit immediate.
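// (for RISC-V this is the signed 12-bit range [-2048, 2047] accepted by addi)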
  149. if (Imm.getMinSignedBits() <= 64 &&
  150. getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
  151. return TTI::TCC_Free;
  152. }
  153. }
  154. // Otherwise, use the full materialisation cost.
  155. return getIntImmCost(Imm, Ty, CostKind);
  156. }
  157. // By default, prevent hoisting.
  158. return TTI::TCC_Free;
  159. }
  160. InstructionCost
  161. RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
  162. const APInt &Imm, Type *Ty,
  163. TTI::TargetCostKind CostKind) {
  164. // Prevent hoisting in unknown cases.
  165. return TTI::TCC_Free;
  166. }
  167. TargetTransformInfo::PopcntSupportKind
  168. RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  169. assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  170. return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
  171. }
  172. bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  173. // Currently, the ExpandReductions pass can't expand scalable-vector
  174. // reductions, but we still request expansion as RVV doesn't support certain
  175. // reductions and the SelectionDAG can't legalize them either.
  176. switch (II->getIntrinsicID()) {
  177. default:
  178. return false;
  179. // These reductions have no equivalent in RVV
  180. case Intrinsic::vector_reduce_mul:
  181. case Intrinsic::vector_reduce_fmul:
  182. return true;
  183. }
  184. }
  185. std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  186. if (ST->hasVInstructions())
  187. return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  188. return BaseT::getMaxVScale();
  189. }
  190. std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  191. if (ST->hasVInstructions())
  192. if (unsigned MinVLen = ST->getRealMinVLen();
  193. MinVLen >= RISCV::RVVBitsPerBlock)
  194. return MinVLen / RISCV::RVVBitsPerBlock;
  195. return BaseT::getVScaleForTuning();
  196. }
  197. TypeSize
  198. RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  199. unsigned LMUL = PowerOf2Floor(
  200. std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
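// For example, a riscv-v-register-bit-width-lmul value of 3 is floored to 2,
// and any value above 8 is clamped to 8.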
  201. switch (K) {
  202. case TargetTransformInfo::RGK_Scalar:
  203. return TypeSize::getFixed(ST->getXLen());
  204. case TargetTransformInfo::RGK_FixedWidthVector:
  205. return TypeSize::getFixed(
  206. ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  207. case TargetTransformInfo::RGK_ScalableVector:
  208. return TypeSize::getScalable(
  209. (ST->hasVInstructions() &&
  210. ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
  211. ? LMUL * RISCV::RVVBitsPerBlock
  212. : 0);
  213. }
  214. llvm_unreachable("Unsupported register kind");
  215. }
  216. InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  217. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  218. unsigned Cost = 2; // vslidedown+vslideup.
  219. // TODO: Multiplying by LT.first implies this legalizes into multiple copies
  220. // of similar code, but I think we expand through memory.
  221. return Cost * LT.first * getLMULCost(LT.second);
  222. }
  223. InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
  224. VectorType *Tp, ArrayRef<int> Mask,
  225. TTI::TargetCostKind CostKind,
  226. int Index, VectorType *SubTp,
  227. ArrayRef<const Value *> Args) {
  228. if (isa<ScalableVectorType>(Tp)) {
  229. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  230. switch (Kind) {
  231. default:
  232. // Fallthrough to generic handling.
  233. // TODO: Most of these cases will return getInvalid in generic code, and
  234. // must be implemented here.
  235. break;
  236. case TTI::SK_Broadcast: {
  237. return LT.first * 1;
  238. }
  239. case TTI::SK_Splice:
  240. return getSpliceCost(Tp, Index);
  241. case TTI::SK_Reverse:
  242. // Most of the cost here is producing the vrgather index register
  243. // Example sequence:
  244. // csrr a0, vlenb
  245. // srli a0, a0, 3
  246. // addi a0, a0, -1
  247. // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
  248. // vid.v v9
  249. // vrsub.vx v10, v9, a0
  250. // vrgather.vv v9, v8, v10
  251. if (Tp->getElementType()->isIntegerTy(1))
  252. // Mask operations additionally require an extend and a truncate.
  253. return LT.first * 9;
  254. return LT.first * 6;
  255. }
  256. }
  257. if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
  258. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  259. bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
  260. Instruction::InsertElement);
  261. if (LT.second.getScalarSizeInBits() == 1) {
  262. if (HasScalar) {
  263. // Example sequence:
  264. // andi a0, a0, 1
  265. // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
  266. // vmv.v.x v8, a0
  267. // vmsne.vi v0, v8, 0
  268. return LT.first * getLMULCost(LT.second) * 3;
  269. }
  270. // Example sequence:
  271. // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
  272. // vmv.v.i v8, 0
  273. // vmerge.vim v8, v8, 1, v0
  274. // vmv.x.s a0, v8
  275. // andi a0, a0, 1
  276. // vmv.v.x v8, a0
  277. // vmsne.vi v0, v8, 0
  278. return LT.first * getLMULCost(LT.second) * 6;
  279. }
  280. if (HasScalar) {
  281. // Example sequence:
  282. // vmv.v.x v8, a0
  283. return LT.first * getLMULCost(LT.second);
  284. }
  285. // Example sequence:
  286. // vrgather.vi v9, v8, 0
  287. // TODO: vrgather could be slower than vmv.v.x. It is
  288. // implementation-dependent.
  289. return LT.first * getLMULCost(LT.second);
  290. }
  291. return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
  292. }
  293. InstructionCost
  294. RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
  295. unsigned AddressSpace,
  296. TTI::TargetCostKind CostKind) {
  297. if (!isLegalMaskedLoadStore(Src, Alignment) ||
  298. CostKind != TTI::TCK_RecipThroughput)
  299. return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  300. CostKind);
  301. return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  302. }
  303. InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
  304. unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
  305. Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  306. if (CostKind != TTI::TCK_RecipThroughput)
  307. return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
  308. Alignment, CostKind, I);
  309. if ((Opcode == Instruction::Load &&
  310. !isLegalMaskedGather(DataTy, Align(Alignment))) ||
  311. (Opcode == Instruction::Store &&
  312. !isLegalMaskedScatter(DataTy, Align(Alignment))))
  313. return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
  314. Alignment, CostKind, I);
  315. // Cost is proportional to the number of memory operations implied. For
  316. // scalable vectors, we use an estimate on that number since we don't
  317. // know exactly what VL will be.
  318. auto &VTy = *cast<VectorType>(DataTy);
  319. InstructionCost MemOpCost =
  320. getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
  321. {TTI::OK_AnyValue, TTI::OP_None}, I);
  322. unsigned NumLoads = getEstimatedVLFor(&VTy);
  323. return NumLoads * MemOpCost;
  324. }
  325. // Currently, these represent both throughput and codesize costs
  326. // for the respective intrinsics. The costs in this table are simply
  327. // instruction counts with the following adjustments made:
  328. // * One vsetvli is considered free.
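// For example, the entries of 9 below for vector floor/ceil correspond to the
// instruction count of the lowered sequence (assumed here to include a mask
// setup, a rounding-mode swap, and a pair of vfcvt conversions).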
  329. static const CostTblEntry VectorIntrinsicCostTable[]{
  330. {Intrinsic::floor, MVT::v2f32, 9},
  331. {Intrinsic::floor, MVT::v4f32, 9},
  332. {Intrinsic::floor, MVT::v8f32, 9},
  333. {Intrinsic::floor, MVT::v16f32, 9},
  334. {Intrinsic::floor, MVT::nxv1f32, 9},
  335. {Intrinsic::floor, MVT::nxv2f32, 9},
  336. {Intrinsic::floor, MVT::nxv4f32, 9},
  337. {Intrinsic::floor, MVT::nxv8f32, 9},
  338. {Intrinsic::floor, MVT::nxv16f32, 9},
  339. {Intrinsic::floor, MVT::v2f64, 9},
  340. {Intrinsic::floor, MVT::v4f64, 9},
  341. {Intrinsic::floor, MVT::v8f64, 9},
  342. {Intrinsic::floor, MVT::v16f64, 9},
  343. {Intrinsic::floor, MVT::nxv1f64, 9},
  344. {Intrinsic::floor, MVT::nxv2f64, 9},
  345. {Intrinsic::floor, MVT::nxv4f64, 9},
  346. {Intrinsic::floor, MVT::nxv8f64, 9},
  347. {Intrinsic::ceil, MVT::v2f32, 9},
  348. {Intrinsic::ceil, MVT::v4f32, 9},
  349. {Intrinsic::ceil, MVT::v8f32, 9},
  350. {Intrinsic::ceil, MVT::v16f32, 9},
  351. {Intrinsic::ceil, MVT::nxv1f32, 9},
  352. {Intrinsic::ceil, MVT::nxv2f32, 9},
  353. {Intrinsic::ceil, MVT::nxv4f32, 9},
  354. {Intrinsic::ceil, MVT::nxv8f32, 9},
  355. {Intrinsic::ceil, MVT::nxv16f32, 9},
  356. {Intrinsic::ceil, MVT::v2f64, 9},
  357. {Intrinsic::ceil, MVT::v4f64, 9},
  358. {Intrinsic::ceil, MVT::v8f64, 9},
  359. {Intrinsic::ceil, MVT::v16f64, 9},
  360. {Intrinsic::ceil, MVT::nxv1f64, 9},
  361. {Intrinsic::ceil, MVT::nxv2f64, 9},
  362. {Intrinsic::ceil, MVT::nxv4f64, 9},
  363. {Intrinsic::ceil, MVT::nxv8f64, 9},
  364. {Intrinsic::trunc, MVT::v2f32, 7},
  365. {Intrinsic::trunc, MVT::v4f32, 7},
  366. {Intrinsic::trunc, MVT::v8f32, 7},
  367. {Intrinsic::trunc, MVT::v16f32, 7},
  368. {Intrinsic::trunc, MVT::nxv1f32, 7},
  369. {Intrinsic::trunc, MVT::nxv2f32, 7},
  370. {Intrinsic::trunc, MVT::nxv4f32, 7},
  371. {Intrinsic::trunc, MVT::nxv8f32, 7},
  372. {Intrinsic::trunc, MVT::nxv16f32, 7},
  373. {Intrinsic::trunc, MVT::v2f64, 7},
  374. {Intrinsic::trunc, MVT::v4f64, 7},
  375. {Intrinsic::trunc, MVT::v8f64, 7},
  376. {Intrinsic::trunc, MVT::v16f64, 7},
  377. {Intrinsic::trunc, MVT::nxv1f64, 7},
  378. {Intrinsic::trunc, MVT::nxv2f64, 7},
  379. {Intrinsic::trunc, MVT::nxv4f64, 7},
  380. {Intrinsic::trunc, MVT::nxv8f64, 7},
  381. {Intrinsic::round, MVT::v2f32, 9},
  382. {Intrinsic::round, MVT::v4f32, 9},
  383. {Intrinsic::round, MVT::v8f32, 9},
  384. {Intrinsic::round, MVT::v16f32, 9},
  385. {Intrinsic::round, MVT::nxv1f32, 9},
  386. {Intrinsic::round, MVT::nxv2f32, 9},
  387. {Intrinsic::round, MVT::nxv4f32, 9},
  388. {Intrinsic::round, MVT::nxv8f32, 9},
  389. {Intrinsic::round, MVT::nxv16f32, 9},
  390. {Intrinsic::round, MVT::v2f64, 9},
  391. {Intrinsic::round, MVT::v4f64, 9},
  392. {Intrinsic::round, MVT::v8f64, 9},
  393. {Intrinsic::round, MVT::v16f64, 9},
  394. {Intrinsic::round, MVT::nxv1f64, 9},
  395. {Intrinsic::round, MVT::nxv2f64, 9},
  396. {Intrinsic::round, MVT::nxv4f64, 9},
  397. {Intrinsic::round, MVT::nxv8f64, 9},
  398. {Intrinsic::roundeven, MVT::v2f32, 9},
  399. {Intrinsic::roundeven, MVT::v4f32, 9},
  400. {Intrinsic::roundeven, MVT::v8f32, 9},
  401. {Intrinsic::roundeven, MVT::v16f32, 9},
  402. {Intrinsic::roundeven, MVT::nxv1f32, 9},
  403. {Intrinsic::roundeven, MVT::nxv2f32, 9},
  404. {Intrinsic::roundeven, MVT::nxv4f32, 9},
  405. {Intrinsic::roundeven, MVT::nxv8f32, 9},
  406. {Intrinsic::roundeven, MVT::nxv16f32, 9},
  407. {Intrinsic::roundeven, MVT::v2f64, 9},
  408. {Intrinsic::roundeven, MVT::v4f64, 9},
  409. {Intrinsic::roundeven, MVT::v8f64, 9},
  410. {Intrinsic::roundeven, MVT::v16f64, 9},
  411. {Intrinsic::roundeven, MVT::nxv1f64, 9},
  412. {Intrinsic::roundeven, MVT::nxv2f64, 9},
  413. {Intrinsic::roundeven, MVT::nxv4f64, 9},
  414. {Intrinsic::roundeven, MVT::nxv8f64, 9},
  415. {Intrinsic::bswap, MVT::v2i16, 3},
  416. {Intrinsic::bswap, MVT::v4i16, 3},
  417. {Intrinsic::bswap, MVT::v8i16, 3},
  418. {Intrinsic::bswap, MVT::v16i16, 3},
  419. {Intrinsic::bswap, MVT::nxv1i16, 3},
  420. {Intrinsic::bswap, MVT::nxv2i16, 3},
  421. {Intrinsic::bswap, MVT::nxv4i16, 3},
  422. {Intrinsic::bswap, MVT::nxv8i16, 3},
  423. {Intrinsic::bswap, MVT::nxv16i16, 3},
  424. {Intrinsic::bswap, MVT::v2i32, 12},
  425. {Intrinsic::bswap, MVT::v4i32, 12},
  426. {Intrinsic::bswap, MVT::v8i32, 12},
  427. {Intrinsic::bswap, MVT::v16i32, 12},
  428. {Intrinsic::bswap, MVT::nxv1i32, 12},
  429. {Intrinsic::bswap, MVT::nxv2i32, 12},
  430. {Intrinsic::bswap, MVT::nxv4i32, 12},
  431. {Intrinsic::bswap, MVT::nxv8i32, 12},
  432. {Intrinsic::bswap, MVT::nxv16i32, 12},
  433. {Intrinsic::bswap, MVT::v2i64, 31},
  434. {Intrinsic::bswap, MVT::v4i64, 31},
  435. {Intrinsic::bswap, MVT::v8i64, 31},
  436. {Intrinsic::bswap, MVT::v16i64, 31},
  437. {Intrinsic::bswap, MVT::nxv1i64, 31},
  438. {Intrinsic::bswap, MVT::nxv2i64, 31},
  439. {Intrinsic::bswap, MVT::nxv4i64, 31},
  440. {Intrinsic::bswap, MVT::nxv8i64, 31},
  441. {Intrinsic::vp_bswap, MVT::v2i16, 3},
  442. {Intrinsic::vp_bswap, MVT::v4i16, 3},
  443. {Intrinsic::vp_bswap, MVT::v8i16, 3},
  444. {Intrinsic::vp_bswap, MVT::v16i16, 3},
  445. {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
  446. {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
  447. {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
  448. {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
  449. {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
  450. {Intrinsic::vp_bswap, MVT::v2i32, 12},
  451. {Intrinsic::vp_bswap, MVT::v4i32, 12},
  452. {Intrinsic::vp_bswap, MVT::v8i32, 12},
  453. {Intrinsic::vp_bswap, MVT::v16i32, 12},
  454. {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
  455. {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
  456. {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
  457. {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
  458. {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
  459. {Intrinsic::vp_bswap, MVT::v2i64, 31},
  460. {Intrinsic::vp_bswap, MVT::v4i64, 31},
  461. {Intrinsic::vp_bswap, MVT::v8i64, 31},
  462. {Intrinsic::vp_bswap, MVT::v16i64, 31},
  463. {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
  464. {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
  465. {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
  466. {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
  467. {Intrinsic::vp_fshl, MVT::v2i8, 7},
  468. {Intrinsic::vp_fshl, MVT::v4i8, 7},
  469. {Intrinsic::vp_fshl, MVT::v8i8, 7},
  470. {Intrinsic::vp_fshl, MVT::v16i8, 7},
  471. {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
  472. {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
  473. {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
  474. {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
  475. {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
  476. {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
  477. {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
  478. {Intrinsic::vp_fshl, MVT::v2i16, 7},
  479. {Intrinsic::vp_fshl, MVT::v4i16, 7},
  480. {Intrinsic::vp_fshl, MVT::v8i16, 7},
  481. {Intrinsic::vp_fshl, MVT::v16i16, 7},
  482. {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
  483. {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
  484. {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
  485. {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
  486. {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
  487. {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
  488. {Intrinsic::vp_fshl, MVT::v2i32, 7},
  489. {Intrinsic::vp_fshl, MVT::v4i32, 7},
  490. {Intrinsic::vp_fshl, MVT::v8i32, 7},
  491. {Intrinsic::vp_fshl, MVT::v16i32, 7},
  492. {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
  493. {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
  494. {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
  495. {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
  496. {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
  497. {Intrinsic::vp_fshl, MVT::v2i64, 7},
  498. {Intrinsic::vp_fshl, MVT::v4i64, 7},
  499. {Intrinsic::vp_fshl, MVT::v8i64, 7},
  500. {Intrinsic::vp_fshl, MVT::v16i64, 7},
  501. {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
  502. {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
  503. {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
  504. {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
  505. {Intrinsic::vp_fshr, MVT::v2i8, 7},
  506. {Intrinsic::vp_fshr, MVT::v4i8, 7},
  507. {Intrinsic::vp_fshr, MVT::v8i8, 7},
  508. {Intrinsic::vp_fshr, MVT::v16i8, 7},
  509. {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
  510. {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
  511. {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
  512. {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
  513. {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
  514. {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
  515. {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
  516. {Intrinsic::vp_fshr, MVT::v2i16, 7},
  517. {Intrinsic::vp_fshr, MVT::v4i16, 7},
  518. {Intrinsic::vp_fshr, MVT::v8i16, 7},
  519. {Intrinsic::vp_fshr, MVT::v16i16, 7},
  520. {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
  521. {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
  522. {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
  523. {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
  524. {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
  525. {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
  526. {Intrinsic::vp_fshr, MVT::v2i32, 7},
  527. {Intrinsic::vp_fshr, MVT::v4i32, 7},
  528. {Intrinsic::vp_fshr, MVT::v8i32, 7},
  529. {Intrinsic::vp_fshr, MVT::v16i32, 7},
  530. {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
  531. {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
  532. {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
  533. {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
  534. {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
  535. {Intrinsic::vp_fshr, MVT::v2i64, 7},
  536. {Intrinsic::vp_fshr, MVT::v4i64, 7},
  537. {Intrinsic::vp_fshr, MVT::v8i64, 7},
  538. {Intrinsic::vp_fshr, MVT::v16i64, 7},
  539. {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
  540. {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
  541. {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
  542. {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
  543. {Intrinsic::bitreverse, MVT::v2i8, 17},
  544. {Intrinsic::bitreverse, MVT::v4i8, 17},
  545. {Intrinsic::bitreverse, MVT::v8i8, 17},
  546. {Intrinsic::bitreverse, MVT::v16i8, 17},
  547. {Intrinsic::bitreverse, MVT::nxv1i8, 17},
  548. {Intrinsic::bitreverse, MVT::nxv2i8, 17},
  549. {Intrinsic::bitreverse, MVT::nxv4i8, 17},
  550. {Intrinsic::bitreverse, MVT::nxv8i8, 17},
  551. {Intrinsic::bitreverse, MVT::nxv16i8, 17},
  552. {Intrinsic::bitreverse, MVT::v2i16, 24},
  553. {Intrinsic::bitreverse, MVT::v4i16, 24},
  554. {Intrinsic::bitreverse, MVT::v8i16, 24},
  555. {Intrinsic::bitreverse, MVT::v16i16, 24},
  556. {Intrinsic::bitreverse, MVT::nxv1i16, 24},
  557. {Intrinsic::bitreverse, MVT::nxv2i16, 24},
  558. {Intrinsic::bitreverse, MVT::nxv4i16, 24},
  559. {Intrinsic::bitreverse, MVT::nxv8i16, 24},
  560. {Intrinsic::bitreverse, MVT::nxv16i16, 24},
  561. {Intrinsic::bitreverse, MVT::v2i32, 33},
  562. {Intrinsic::bitreverse, MVT::v4i32, 33},
  563. {Intrinsic::bitreverse, MVT::v8i32, 33},
  564. {Intrinsic::bitreverse, MVT::v16i32, 33},
  565. {Intrinsic::bitreverse, MVT::nxv1i32, 33},
  566. {Intrinsic::bitreverse, MVT::nxv2i32, 33},
  567. {Intrinsic::bitreverse, MVT::nxv4i32, 33},
  568. {Intrinsic::bitreverse, MVT::nxv8i32, 33},
  569. {Intrinsic::bitreverse, MVT::nxv16i32, 33},
  570. {Intrinsic::bitreverse, MVT::v2i64, 52},
  571. {Intrinsic::bitreverse, MVT::v4i64, 52},
  572. {Intrinsic::bitreverse, MVT::v8i64, 52},
  573. {Intrinsic::bitreverse, MVT::v16i64, 52},
  574. {Intrinsic::bitreverse, MVT::nxv1i64, 52},
  575. {Intrinsic::bitreverse, MVT::nxv2i64, 52},
  576. {Intrinsic::bitreverse, MVT::nxv4i64, 52},
  577. {Intrinsic::bitreverse, MVT::nxv8i64, 52},
  578. {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
  579. {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
  580. {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
  581. {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
  582. {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
  583. {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
  584. {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
  585. {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
  586. {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
  587. {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
  588. {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
  589. {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
  590. {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
  591. {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
  592. {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
  593. {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
  594. {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
  595. {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
  596. {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
  597. {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
  598. {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
  599. {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
  600. {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
  601. {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
  602. {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
  603. {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
  604. {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
  605. {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
  606. {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
  607. {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
  608. {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
  609. {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
  610. {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
  611. {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
  612. {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
  613. {Intrinsic::ctpop, MVT::v2i8, 12},
  614. {Intrinsic::ctpop, MVT::v4i8, 12},
  615. {Intrinsic::ctpop, MVT::v8i8, 12},
  616. {Intrinsic::ctpop, MVT::v16i8, 12},
  617. {Intrinsic::ctpop, MVT::nxv1i8, 12},
  618. {Intrinsic::ctpop, MVT::nxv2i8, 12},
  619. {Intrinsic::ctpop, MVT::nxv4i8, 12},
  620. {Intrinsic::ctpop, MVT::nxv8i8, 12},
  621. {Intrinsic::ctpop, MVT::nxv16i8, 12},
  622. {Intrinsic::ctpop, MVT::v2i16, 19},
  623. {Intrinsic::ctpop, MVT::v4i16, 19},
  624. {Intrinsic::ctpop, MVT::v8i16, 19},
  625. {Intrinsic::ctpop, MVT::v16i16, 19},
  626. {Intrinsic::ctpop, MVT::nxv1i16, 19},
  627. {Intrinsic::ctpop, MVT::nxv2i16, 19},
  628. {Intrinsic::ctpop, MVT::nxv4i16, 19},
  629. {Intrinsic::ctpop, MVT::nxv8i16, 19},
  630. {Intrinsic::ctpop, MVT::nxv16i16, 19},
  631. {Intrinsic::ctpop, MVT::v2i32, 20},
  632. {Intrinsic::ctpop, MVT::v4i32, 20},
  633. {Intrinsic::ctpop, MVT::v8i32, 20},
  634. {Intrinsic::ctpop, MVT::v16i32, 20},
  635. {Intrinsic::ctpop, MVT::nxv1i32, 20},
  636. {Intrinsic::ctpop, MVT::nxv2i32, 20},
  637. {Intrinsic::ctpop, MVT::nxv4i32, 20},
  638. {Intrinsic::ctpop, MVT::nxv8i32, 20},
  639. {Intrinsic::ctpop, MVT::nxv16i32, 20},
  640. {Intrinsic::ctpop, MVT::v2i64, 21},
  641. {Intrinsic::ctpop, MVT::v4i64, 21},
  642. {Intrinsic::ctpop, MVT::v8i64, 21},
  643. {Intrinsic::ctpop, MVT::v16i64, 21},
  644. {Intrinsic::ctpop, MVT::nxv1i64, 21},
  645. {Intrinsic::ctpop, MVT::nxv2i64, 21},
  646. {Intrinsic::ctpop, MVT::nxv4i64, 21},
  647. {Intrinsic::ctpop, MVT::nxv8i64, 21},
  648. {Intrinsic::vp_ctpop, MVT::v2i8, 12},
  649. {Intrinsic::vp_ctpop, MVT::v4i8, 12},
  650. {Intrinsic::vp_ctpop, MVT::v8i8, 12},
  651. {Intrinsic::vp_ctpop, MVT::v16i8, 12},
  652. {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
  653. {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
  654. {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
  655. {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
  656. {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
  657. {Intrinsic::vp_ctpop, MVT::v2i16, 19},
  658. {Intrinsic::vp_ctpop, MVT::v4i16, 19},
  659. {Intrinsic::vp_ctpop, MVT::v8i16, 19},
  660. {Intrinsic::vp_ctpop, MVT::v16i16, 19},
  661. {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
  662. {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
  663. {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
  664. {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
  665. {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
  666. {Intrinsic::vp_ctpop, MVT::v2i32, 20},
  667. {Intrinsic::vp_ctpop, MVT::v4i32, 20},
  668. {Intrinsic::vp_ctpop, MVT::v8i32, 20},
  669. {Intrinsic::vp_ctpop, MVT::v16i32, 20},
  670. {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
  671. {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
  672. {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
  673. {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
  674. {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
  675. {Intrinsic::vp_ctpop, MVT::v2i64, 21},
  676. {Intrinsic::vp_ctpop, MVT::v4i64, 21},
  677. {Intrinsic::vp_ctpop, MVT::v8i64, 21},
  678. {Intrinsic::vp_ctpop, MVT::v16i64, 21},
  679. {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
  680. {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
  681. {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
  682. {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
  683. {Intrinsic::vp_ctlz, MVT::v2i8, 19},
  684. {Intrinsic::vp_ctlz, MVT::v4i8, 19},
  685. {Intrinsic::vp_ctlz, MVT::v8i8, 19},
  686. {Intrinsic::vp_ctlz, MVT::v16i8, 19},
  687. {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
  688. {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
  689. {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
  690. {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
  691. {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
  692. {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
  693. {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
  694. {Intrinsic::vp_ctlz, MVT::v2i16, 28},
  695. {Intrinsic::vp_ctlz, MVT::v4i16, 28},
  696. {Intrinsic::vp_ctlz, MVT::v8i16, 28},
  697. {Intrinsic::vp_ctlz, MVT::v16i16, 28},
  698. {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
  699. {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
  700. {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
  701. {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
  702. {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
  703. {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
  704. {Intrinsic::vp_ctlz, MVT::v2i32, 31},
  705. {Intrinsic::vp_ctlz, MVT::v4i32, 31},
  706. {Intrinsic::vp_ctlz, MVT::v8i32, 31},
  707. {Intrinsic::vp_ctlz, MVT::v16i32, 31},
  708. {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
  709. {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
  710. {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
  711. {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
  712. {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
  713. {Intrinsic::vp_ctlz, MVT::v2i64, 35},
  714. {Intrinsic::vp_ctlz, MVT::v4i64, 35},
  715. {Intrinsic::vp_ctlz, MVT::v8i64, 35},
  716. {Intrinsic::vp_ctlz, MVT::v16i64, 35},
  717. {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
  718. {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
  719. {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
  720. {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
  721. {Intrinsic::vp_cttz, MVT::v2i8, 16},
  722. {Intrinsic::vp_cttz, MVT::v4i8, 16},
  723. {Intrinsic::vp_cttz, MVT::v8i8, 16},
  724. {Intrinsic::vp_cttz, MVT::v16i8, 16},
  725. {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
  726. {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
  727. {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
  728. {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
  729. {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
  730. {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
  731. {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
  732. {Intrinsic::vp_cttz, MVT::v2i16, 23},
  733. {Intrinsic::vp_cttz, MVT::v4i16, 23},
  734. {Intrinsic::vp_cttz, MVT::v8i16, 23},
  735. {Intrinsic::vp_cttz, MVT::v16i16, 23},
  736. {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
  737. {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
  738. {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
  739. {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
  740. {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
  741. {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
  742. {Intrinsic::vp_cttz, MVT::v2i32, 24},
  743. {Intrinsic::vp_cttz, MVT::v4i32, 24},
  744. {Intrinsic::vp_cttz, MVT::v8i32, 24},
  745. {Intrinsic::vp_cttz, MVT::v16i32, 24},
  746. {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
  747. {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
  748. {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
  749. {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
  750. {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
  751. {Intrinsic::vp_cttz, MVT::v2i64, 25},
  752. {Intrinsic::vp_cttz, MVT::v4i64, 25},
  753. {Intrinsic::vp_cttz, MVT::v8i64, 25},
  754. {Intrinsic::vp_cttz, MVT::v16i64, 25},
  755. {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
  756. {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
  757. {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
  758. {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
  759. };
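// Map a VP intrinsic ID to its ISD opcode: VPIntrinsics.def expands
// HELPER_MAP_VPID_TO_VPSD once per VP intrinsic that has an SDNode, producing
// one case per intrinsic (e.g. vp_ceil to a VP_* node such as ISD::VP_FCEIL).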
  760. static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  761. switch (ID) {
  762. #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
  763. case Intrinsic::VPID: \
  764. return ISD::VPSD;
  765. #include "llvm/IR/VPIntrinsics.def"
  766. #undef HELPER_MAP_VPID_TO_VPSD
  767. }
  768. return ISD::DELETED_NODE;
  769. }
  770. InstructionCost
  771. RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  772. TTI::TargetCostKind CostKind) {
  773. auto *RetTy = ICA.getReturnType();
  774. switch (ICA.getID()) {
  775. case Intrinsic::ceil:
  776. case Intrinsic::floor:
  777. case Intrinsic::trunc:
  778. case Intrinsic::rint:
  779. case Intrinsic::round:
  780. case Intrinsic::roundeven: {
  781. // These all use the same code.
  782. auto LT = getTypeLegalizationCost(RetTy);
  783. if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
  784. return LT.first * 8;
  785. break;
  786. }
  787. case Intrinsic::umin:
  788. case Intrinsic::umax:
  789. case Intrinsic::smin:
  790. case Intrinsic::smax: {
  791. auto LT = getTypeLegalizationCost(RetTy);
  792. if ((ST->hasVInstructions() && LT.second.isVector()) ||
  793. (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
  794. return LT.first;
  795. break;
  796. }
  797. case Intrinsic::sadd_sat:
  798. case Intrinsic::ssub_sat:
  799. case Intrinsic::uadd_sat:
  800. case Intrinsic::usub_sat: {
  801. auto LT = getTypeLegalizationCost(RetTy);
  802. if (ST->hasVInstructions() && LT.second.isVector())
  803. return LT.first;
  804. break;
  805. }
  806. case Intrinsic::abs: {
  807. auto LT = getTypeLegalizationCost(RetTy);
  808. if (ST->hasVInstructions() && LT.second.isVector()) {
  809. // vrsub.vi v10, v8, 0
  810. // vmax.vv v8, v8, v10
  811. return LT.first * 2;
  812. }
  813. break;
  814. }
  815. case Intrinsic::fabs:
  816. case Intrinsic::sqrt: {
  817. auto LT = getTypeLegalizationCost(RetTy);
  818. if (ST->hasVInstructions() && LT.second.isVector())
  819. return LT.first;
  820. break;
  821. }
  822. // TODO: add more intrinsics
  823. case Intrinsic::experimental_stepvector: {
  824. unsigned Cost = 1; // vid
  825. auto LT = getTypeLegalizationCost(RetTy);
  826. return Cost + (LT.first - 1);
  827. }
  828. case Intrinsic::vp_rint: {
  829. // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
  830. unsigned Cost = 5;
  831. auto LT = getTypeLegalizationCost(RetTy);
  832. if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
  833. return Cost * LT.first;
  834. break;
  835. }
  836. case Intrinsic::vp_nearbyint: {
  837. // One more read and one write of fflags than vp_rint.
  838. unsigned Cost = 7;
  839. auto LT = getTypeLegalizationCost(RetTy);
  840. if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
  841. return Cost * LT.first;
  842. break;
  843. }
  844. case Intrinsic::vp_ceil:
  845. case Intrinsic::vp_floor:
  846. case Intrinsic::vp_round:
  847. case Intrinsic::vp_roundeven:
  848. case Intrinsic::vp_roundtozero: {
  849. // Rounding with static rounding mode needs two more instructions to
  850. // swap/write FRM than vp_rint.
  851. unsigned Cost = 7;
  852. auto LT = getTypeLegalizationCost(RetTy);
  853. unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
  854. if (TLI->isOperationCustom(VPISD, LT.second))
  855. return Cost * LT.first;
  856. break;
  857. }
  858. }
  859. if (ST->hasVInstructions() && RetTy->isVectorTy()) {
  860. auto LT = getTypeLegalizationCost(RetTy);
  861. if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
  862. ICA.getID(), LT.second))
  863. return LT.first * Entry->Cost;
  864. }
  865. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  866. }
  867. InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  868. Type *Src,
  869. TTI::CastContextHint CCH,
  870. TTI::TargetCostKind CostKind,
  871. const Instruction *I) {
  872. if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
  873. // FIXME: Need to compute legalizing cost for illegal types.
  874. if (!isTypeLegal(Src) || !isTypeLegal(Dst))
  875. return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  876. // Skip if element size of Dst or Src is bigger than ELEN.
  877. if (Src->getScalarSizeInBits() > ST->getELEN() ||
  878. Dst->getScalarSizeInBits() > ST->getELEN())
  879. return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  880. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  881. assert(ISD && "Invalid opcode");
  882. // FIXME: Need to consider vsetvli and lmul.
  883. int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
  884. (int)Log2_32(Src->getScalarSizeInBits());
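// Illustrative example (assumed): an fpext from an f16 vector to an f64
// vector has PowDiff == 2 and is modeled below as two widening conversions.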
  885. switch (ISD) {
  886. case ISD::SIGN_EXTEND:
  887. case ISD::ZERO_EXTEND:
  888. if (Src->getScalarSizeInBits() == 1) {
  889. // We do not use vsext/vzext to extend from a mask vector.
  890. // Instead we use the following instructions to extend from a mask vector:
  891. // vmv.v.i v8, 0
  892. // vmerge.vim v8, v8, -1, v0
  893. return 2;
  894. }
  895. return 1;
  896. case ISD::TRUNCATE:
  897. if (Dst->getScalarSizeInBits() == 1) {
  898. // We do not use a chain of vncvt instructions to truncate to a mask vector,
  899. // so PowDiff cannot be used to calculate the cost.
  900. // Instead we use the following instructions to truncate to a mask vector:
  901. // vand.vi v8, v8, 1
  902. // vmsne.vi v0, v8, 0
  903. return 2;
  904. }
  905. [[fallthrough]];
  906. case ISD::FP_EXTEND:
  907. case ISD::FP_ROUND:
  908. // Counts of narrow/widen instructions.
  909. return std::abs(PowDiff);
  910. case ISD::FP_TO_SINT:
  911. case ISD::FP_TO_UINT:
  912. case ISD::SINT_TO_FP:
  913. case ISD::UINT_TO_FP:
  914. if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
  915. // The cost of converting from or to a mask vector is different from the
  916. // other cases, so PowDiff cannot be used to calculate it.
  917. // For mask vector to fp, we should use the following instructions:
  918. // vmv.v.i v8, 0
  919. // vmerge.vim v8, v8, -1, v0
  920. // vfcvt.f.x.v v8, v8
  921. // And for fp vector to mask, we use:
  922. // vfncvt.rtz.x.f.w v9, v8
  923. // vand.vi v8, v9, 1
  924. // vmsne.vi v0, v8, 0
  925. return 3;
  926. }
  927. if (std::abs(PowDiff) <= 1)
  928. return 1;
  929. // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
  930. // so it only needs two conversions.
  931. if (Src->isIntOrIntVectorTy())
  932. return 2;
  933. // Counts of narrow/widen instructions.
  934. return std::abs(PowDiff);
  935. }
  936. }
  937. return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  938. }
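// Estimate the number of elements (VL) processed for a vector type. For
// scalable types this is a guess based on getVScaleForTuning(); e.g. with a
// tuning vscale of 2 (VLEN == 128), <vscale x 4 x i32> is estimated at VL == 8.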
  939. unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  940. if (isa<ScalableVectorType>(Ty)) {
  941. const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
  942. const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
  943. const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
  944. return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  945. }
  946. return cast<FixedVectorType>(Ty)->getNumElements();
  947. }
  948. InstructionCost
  949. RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
  950. bool IsUnsigned,
  951. TTI::TargetCostKind CostKind) {
  952. if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
  953. return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
  954. // Skip if scalar size of Ty is bigger than ELEN.
  955. if (Ty->getScalarSizeInBits() > ST->getELEN())
  956. return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
  957. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  958. if (Ty->getElementType()->isIntegerTy(1))
  959. // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
  960. // cost 2, but we don't have enough info here, so we slightly overestimate.
  961. return (LT.first - 1) + 3;
  962. // An IR reduction is composed of two vmv and one RVV reduction instruction.
  963. InstructionCost BaseCost = 2;
  964. unsigned VL = getEstimatedVLFor(Ty);
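// The Log2 term below is a rough model of the reduction tree depth; e.g. an
// estimated VL of 8 adds Log2_32_Ceil(8) == 3 to the cost.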
  965. return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
  966. }
  967. InstructionCost
  968. RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
  969. std::optional<FastMathFlags> FMF,
  970. TTI::TargetCostKind CostKind) {
  971. if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
  972. return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
  973. // Skip if scalar size of Ty is bigger than ELEN.
  974. if (Ty->getScalarSizeInBits() > ST->getELEN())
  975. return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
  976. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  977. assert(ISD && "Invalid opcode");
  978. if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
  979. ISD != ISD::FADD)
  980. return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
  981. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  982. if (Ty->getElementType()->isIntegerTy(1))
  983. // vcpop sequences, see vreduction-mask.ll
  984. return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
  985. // An IR reduction is composed of two vmv and one RVV reduction instruction.
  986. InstructionCost BaseCost = 2;
  987. unsigned VL = getEstimatedVLFor(Ty);
  988. if (TTI::requiresOrderedReduction(FMF))
  989. return (LT.first - 1) + BaseCost + VL;
  990. return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
  991. }
  992. InstructionCost RISCVTTIImpl::getExtendedReductionCost(
  993. unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
  994. std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
  995. if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
  996. return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
  997. FMF, CostKind);
  998. // Skip if scalar size of ResTy is bigger than ELEN.
  999. if (ResTy->getScalarSizeInBits() > ST->getELEN())
  1000. return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
  1001. FMF, CostKind);
  1002. if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
  1003. return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
  1004. FMF, CostKind);
  1005. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1006. if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
  1007. return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
  1008. FMF, CostKind);
  1009. return (LT.first - 1) +
  1010. getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  1011. }
  1012. InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
  1013. TTI::OperandValueInfo OpInfo,
  1014. TTI::TargetCostKind CostKind) {
  1015. assert(OpInfo.isConstant() && "non constant operand?");
  1016. if (!isa<VectorType>(Ty))
  1017. // FIXME: We need to account for immediate materialization here, but doing
  1018. // a decent job requires more knowledge about the immediate than we
  1019. // currently have here.
  1020. return 0;
  1021. if (OpInfo.isUniform())
  1022. // vmv.v.i, vmv.v.x, or vfmv.v.f
  1023. // We ignore the cost of the scalar constant materialization to be consistent
  1024. // with how we treat scalar constants themselves just above.
  1025. return 1;
  1026. // Add a cost of address generation + the cost of the vector load. The
  1027. // address is expected to be a PC relative offset to a constant pool entry
  1028. // using auipc/addi.
  1029. return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
  1030. /*AddressSpace=*/0, CostKind);
  1031. }
  1032. InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
  1033. MaybeAlign Alignment,
  1034. unsigned AddressSpace,
  1035. TTI::TargetCostKind CostKind,
  1036. TTI::OperandValueInfo OpInfo,
  1037. const Instruction *I) {
  1038. InstructionCost Cost = 0;
  1039. if (Opcode == Instruction::Store && OpInfo.isConstant())
  1040. Cost += getStoreImmCost(Src, OpInfo, CostKind);
  1041. return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1042. CostKind, OpInfo, I);
  1043. }
  1044. InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  1045. Type *CondTy,
  1046. CmpInst::Predicate VecPred,
  1047. TTI::TargetCostKind CostKind,
  1048. const Instruction *I) {
  1049. if (CostKind != TTI::TCK_RecipThroughput)
  1050. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  1051. I);
  1052. if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
  1053. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  1054. I);
  1055. // Skip if scalar size of ValTy is bigger than ELEN.
  1056. if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
  1057. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  1058. I);
  1059. if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
  1060. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1061. if (CondTy->isVectorTy()) {
  1062. if (ValTy->getScalarSizeInBits() == 1) {
  1063. // vmandn.mm v8, v8, v9
  1064. // vmand.mm v9, v0, v9
  1065. // vmor.mm v0, v9, v8
  1066. return LT.first * 3;
  1067. }
  1068. // vselect and max/min are supported natively.
  1069. return LT.first * 1;
  1070. }
  1071. if (ValTy->getScalarSizeInBits() == 1) {
  1072. // vmv.v.x v9, a0
  1073. // vmsne.vi v9, v9, 0
  1074. // vmandn.mm v8, v8, v9
  1075. // vmand.mm v9, v0, v9
  1076. // vmor.mm v0, v9, v8
  1077. return LT.first * 5;
  1078. }
  1079. // vmv.v.x v10, a0
  1080. // vmsne.vi v0, v10, 0
  1081. // vmerge.vvm v8, v9, v8, v0
  1082. return LT.first * 3;
  1083. }
  1084. if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
  1085. ValTy->isVectorTy()) {
  1086. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1087. // Supported natively.
  1088. if (CmpInst::isIntPredicate(VecPred))
  1089. return LT.first * 1;
  1090. // If we do not support the input floating-point vector type, use the base
  1091. // implementation, which computes the cost as:
  1092. // ScalarizeCost + Num * Cost for a fixed vector,
  1093. // InvalidCost for a scalable vector.
  1094. if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
  1095. (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
  1096. (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
  1097. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  1098. I);
  1099. switch (VecPred) {
  1100. // Supported natively.
  1101. case CmpInst::FCMP_OEQ:
  1102. case CmpInst::FCMP_OGT:
  1103. case CmpInst::FCMP_OGE:
  1104. case CmpInst::FCMP_OLT:
  1105. case CmpInst::FCMP_OLE:
  1106. case CmpInst::FCMP_UNE:
  1107. return LT.first * 1;
  1108. // TODO: Other comparisons?
  1109. default:
  1110. break;
  1111. }
  1112. }
  1113. // TODO: Add cost for scalar type.
  1114. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  1115. }
  1116. InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
  1117. TTI::TargetCostKind CostKind,
  1118. unsigned Index, Value *Op0,
  1119. Value *Op1) {
  1120. assert(Val->isVectorTy() && "This must be a vector type");
  1121. if (Opcode != Instruction::ExtractElement &&
  1122. Opcode != Instruction::InsertElement)
  1123. return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  1124. // Legalize the type.
  1125. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
  1126. // This type is legalized to a scalar type.
  1127. if (!LT.second.isVector())
  1128. return 0;
  1129. // For unsupported scalable vectors.
  1130. if (LT.second.isScalableVector() && !LT.first.isValid())
  1131. return LT.first;
  1132. if (!isTypeLegal(Val))
  1133. return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  1134. // In RVV, we can use vslidedown + vmv.x.s to extract an element from a vector
  1135. // and vslideup + vmv.s.x to insert an element into a vector.
  1136. unsigned BaseCost = 1;
  1137. // For insertelement we also need an addi to compute index + 1, the VL used by vslideup.
  1138. unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
  1139. if (Index != -1U) {
  1140. // The type may be split. For fixed-width vectors we can normalize the
  1141. // index to the new type.
  1142. if (LT.second.isFixedLengthVector()) {
  1143. unsigned Width = LT.second.getVectorNumElements();
  1144. Index = Index % Width;
  1145. }
  1146. // We could extract/insert the first element without vslidedown/vslideup.
  1147. if (Index == 0)
  1148. SlideCost = 0;
  1149. else if (Opcode == Instruction::InsertElement)
  1150. SlideCost = 1; // With a constant index, we do not need to use addi.
  1151. }
  1152. // Extract/insert element on a mask vector is different from the normal case.
  1153. if (Val->getScalarSizeInBits() == 1) {
  1154. // For extractelement, we need the following instructions:
  1155. // vmv.v.i v8, 0
  1156. // vmerge.vim v8, v8, 1, v0
  1157. // vsetivli zero, 1, e8, m2, ta, mu (not counted)
  1158. // vslidedown.vx v8, v8, a0
  1159. // vmv.x.s a0, v8
  1160. // For insertelement, we need the following instructions:
  1161. // vsetvli a2, zero, e8, m1, ta, mu (not counted)
  1162. // vmv.s.x v8, a0
  1163. // vmv.v.i v9, 0
  1164. // vmerge.vim v9, v9, 1, v0
  1165. // addi a0, a1, 1
  1166. // vsetvli zero, a0, e8, m1, tu, mu (not counted)
  1167. // vslideup.vx v9, v8, a1
  1168. // vsetvli a0, zero, e8, m1, ta, mu (not counted)
  1169. // vand.vi v8, v9, 1
  1170. // vmsne.vi v0, v8, 0
  1171. // TODO: should we count these special vsetvlis?
  1172. BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
  1173. }
  1174. // Extracting an i64 element on a target with XLEN=32 needs more instructions.
  1175. if (Val->getScalarType()->isIntegerTy() &&
  1176. ST->getXLen() < Val->getScalarSizeInBits()) {
  1177. // For extractelement, we need the following instructions:
  1178. // vsetivli zero, 1, e64, m1, ta, mu (not counted)
  1179. // vslidedown.vx v8, v8, a0
  1180. // vmv.x.s a0, v8
  1181. // li a1, 32
  1182. // vsrl.vx v8, v8, a1
  1183. // vmv.x.s a1, v8
  1184. // For insertelement, we need the following instructions:
  1185. // vsetivli zero, 2, e32, m4, ta, mu (not counted)
  1186. // vmv.v.i v12, 0
  1187. // vslide1up.vx v16, v12, a1
  1188. // vslide1up.vx v12, v16, a0
  1189. // addi a0, a2, 1
  1190. // vsetvli zero, a0, e64, m4, tu, mu (not counted)
  1191. // vslideup.vx v8, v12, a2
  1192. // TODO: should we count these special vsetvlis?
  1193. BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  1194. }
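  // For example, extracting an i64 at a non-zero index on an RV32 target is
  // modeled as 4 + 1 = 5: the vslidedown plus the four instructions
  // (vmv.x.s, li, vsrl.vx, vmv.x.s) that assemble the two 32-bit halves.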
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    // Add the cost of address generation + the cost of the vector load. The
    // address is expected to be a PC-relative offset to a constant pool entry
    // using auipc/addi.
    return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                               /*AddressSpace=*/0, CostKind);
  };
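  // In other words, a constant operand that cannot be folded into a splat is
  // modeled as auipc + addi to form the constant-pool address (cost 2) plus a
  // vector load of the constant.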
  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
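    // These opcodes map to a single RVV instruction. Charge one operation per
    // legalized part (LT.first), scaled by the LMUL-based cost of the
    // legalized type.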
    return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics is needed; any changes would
  // apply to all of the settings below.
  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, independent of the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow at most one exiting block other than the latch. This acts as
  // an early exit, as it mirrors the profitability calculation of the runtime
  // unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }
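      // Sum a size-and-latency estimate for every remaining instruction; the
      // total is compared against a small threshold below to decide whether
      // unrolling should be forced.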
      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
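    // For example (assuming RVVBitsPerBlock == 64), a <vscale x 4 x i32> has
    // a known minimum size of 128 bits and is counted as ceil(128 / 64) = 2
    // vector registers; a fixed <8 x i32> on a target with a 128-bit minimum
    // VLEN is counted as ceil(256 / 128) = 2.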
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // This interface is currently only used by SLP. Returning 1 (which is the
  // default value for SLPMaxVF) disables SLP. We currently have a
  // cost-modeling problem with constant materialization which causes SLP to
  // perform highly unprofitable transformations.
  // TODO: Figure out constant materialization cost modeling and remove.
  return SLPMaxVF;
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // RISC-V specific here: the instruction count gets first priority.
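  // The std::tie comparison below is lexicographic, so ties on Insns fall
  // through to NumRegs, then AddRecCost, and so on down the list.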
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}