AArch64TargetTransformInfo.cpp

  1. //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "AArch64TargetTransformInfo.h"
  9. #include "AArch64ExpandImm.h"
  10. #include "MCTargetDesc/AArch64AddressingModes.h"
  11. #include "llvm/Analysis/IVDescriptors.h"
  12. #include "llvm/Analysis/LoopInfo.h"
  13. #include "llvm/Analysis/TargetTransformInfo.h"
  14. #include "llvm/CodeGen/BasicTTIImpl.h"
  15. #include "llvm/CodeGen/CostTable.h"
  16. #include "llvm/CodeGen/TargetLowering.h"
  17. #include "llvm/IR/Intrinsics.h"
  18. #include "llvm/IR/IntrinsicInst.h"
  19. #include "llvm/IR/IntrinsicsAArch64.h"
  20. #include "llvm/IR/PatternMatch.h"
  21. #include "llvm/Support/Debug.h"
  22. #include "llvm/Transforms/InstCombine/InstCombiner.h"
  23. #include <algorithm>
  24. using namespace llvm;
  25. using namespace llvm::PatternMatch;
  26. #define DEBUG_TYPE "aarch64tti"
  27. static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
  28. cl::init(true), cl::Hidden);
  29. static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
  30. cl::Hidden);
  31. static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
  32. cl::init(10), cl::Hidden);
  33. bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
  34. const Function *Callee) const {
  35. const TargetMachine &TM = getTLI()->getTargetMachine();
  36. const FeatureBitset &CallerBits =
  37. TM.getSubtargetImpl(*Caller)->getFeatureBits();
  38. const FeatureBitset &CalleeBits =
  39. TM.getSubtargetImpl(*Callee)->getFeatureBits();
  40. // Inline a callee if its target-features are a subset of the caller's
  41. // target-features.
  42. return (CallerBits & CalleeBits) == CalleeBits;
  43. }
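// Illustrative example of the subset rule above: a callee compiled with
// "+neon" can be inlined into a caller compiled with "+neon,+sve", but a
// "+sve" callee cannot be inlined into a caller that lacks SVE.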
  44. /// Calculate the cost of materializing a 64-bit value. This helper
  45. /// method might only calculate a fraction of a larger immediate. Therefore it
  46. /// is valid to return a cost of ZERO.
  47. InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  48. // Check if the immediate can be encoded within an instruction.
  49. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
  50. return 0;
  51. if (Val < 0)
  52. Val = ~Val;
  53. // Calculate how many moves we will need to materialize this constant.
  54. SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  55. AArch64_IMM::expandMOVImm(Val, 64, Insn);
  56. return Insn.size();
  57. }
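// Illustrative costs (values chosen for exposition): 0 and any 64-bit logical
// immediate return 0 here, while a value such as 0x0000123400005678 typically
// expands to a MOVZ plus one MOVK and so returns a cost of 2.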
  58. /// Calculate the cost of materializing the given constant.
  59. InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
  60. TTI::TargetCostKind CostKind) {
  61. assert(Ty->isIntegerTy());
  62. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  63. if (BitSize == 0)
  64. return ~0U;
  65. // Sign-extend all constants to a multiple of 64-bit.
  66. APInt ImmVal = Imm;
  67. if (BitSize & 0x3f)
  68. ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
  69. // Split the constant into 64-bit chunks and calculate the cost for each
  70. // chunk.
  71. InstructionCost Cost = 0;
  72. for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
  73. APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
  74. int64_t Val = Tmp.getSExtValue();
  75. Cost += getIntImmCost(Val);
  76. }
  77. // We need at least one instruction to materialize the constant.
  78. return std::max<InstructionCost>(1, Cost);
  79. }
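// Illustrative example: an i128 constant is sign-extended to a multiple of
// 64 bits and costed as two independent 64-bit chunks; if each half needed
// two moves the total cost would be 4, and the result is never less than 1.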
  80. InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  81. const APInt &Imm, Type *Ty,
  82. TTI::TargetCostKind CostKind,
  83. Instruction *Inst) {
  84. assert(Ty->isIntegerTy());
  85. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  86. // There is no cost model for constants with a bit size of 0. Return TCC_Free
  87. // here, so that constant hoisting will ignore this constant.
  88. if (BitSize == 0)
  89. return TTI::TCC_Free;
  90. unsigned ImmIdx = ~0U;
  91. switch (Opcode) {
  92. default:
  93. return TTI::TCC_Free;
  94. case Instruction::GetElementPtr:
  95. // Always hoist the base address of a GetElementPtr.
  96. if (Idx == 0)
  97. return 2 * TTI::TCC_Basic;
  98. return TTI::TCC_Free;
  99. case Instruction::Store:
  100. ImmIdx = 0;
  101. break;
  102. case Instruction::Add:
  103. case Instruction::Sub:
  104. case Instruction::Mul:
  105. case Instruction::UDiv:
  106. case Instruction::SDiv:
  107. case Instruction::URem:
  108. case Instruction::SRem:
  109. case Instruction::And:
  110. case Instruction::Or:
  111. case Instruction::Xor:
  112. case Instruction::ICmp:
  113. ImmIdx = 1;
  114. break;
  115. // Always return TCC_Free for the shift value of a shift instruction.
  116. case Instruction::Shl:
  117. case Instruction::LShr:
  118. case Instruction::AShr:
  119. if (Idx == 1)
  120. return TTI::TCC_Free;
  121. break;
  122. case Instruction::Trunc:
  123. case Instruction::ZExt:
  124. case Instruction::SExt:
  125. case Instruction::IntToPtr:
  126. case Instruction::PtrToInt:
  127. case Instruction::BitCast:
  128. case Instruction::PHI:
  129. case Instruction::Call:
  130. case Instruction::Select:
  131. case Instruction::Ret:
  132. case Instruction::Load:
  133. break;
  134. }
  135. if (Idx == ImmIdx) {
  136. int NumConstants = (BitSize + 63) / 64;
  137. InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  138. return (Cost <= NumConstants * TTI::TCC_Basic)
  139. ? static_cast<int>(TTI::TCC_Free)
  140. : Cost;
  141. }
  142. return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  143. }
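// Rough illustration: for "add i64 %x, 42" the immediate sits at the costed
// operand index and can be materialized in one instruction, so it is reported
// as TCC_Free and constant hoisting leaves it in place; a GetElementPtr base
// address is always reported as costly (2 * TCC_Basic) so that it is hoisted.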
  144. InstructionCost
  145. AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
  146. const APInt &Imm, Type *Ty,
  147. TTI::TargetCostKind CostKind) {
  148. assert(Ty->isIntegerTy());
  149. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  150. // There is no cost model for constants with a bit size of 0. Return TCC_Free
  151. // here, so that constant hoisting will ignore this constant.
  152. if (BitSize == 0)
  153. return TTI::TCC_Free;
  154. // Most (all?) AArch64 intrinsics do not support folding immediates into the
  155. // selected instruction, so we compute the materialization cost for the
  156. // immediate directly.
  157. if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
  158. return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  159. switch (IID) {
  160. default:
  161. return TTI::TCC_Free;
  162. case Intrinsic::sadd_with_overflow:
  163. case Intrinsic::uadd_with_overflow:
  164. case Intrinsic::ssub_with_overflow:
  165. case Intrinsic::usub_with_overflow:
  166. case Intrinsic::smul_with_overflow:
  167. case Intrinsic::umul_with_overflow:
  168. if (Idx == 1) {
  169. int NumConstants = (BitSize + 63) / 64;
  170. InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  171. return (Cost <= NumConstants * TTI::TCC_Basic)
  172. ? static_cast<int>(TTI::TCC_Free)
  173. : Cost;
  174. }
  175. break;
  176. case Intrinsic::experimental_stackmap:
  177. if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  178. return TTI::TCC_Free;
  179. break;
  180. case Intrinsic::experimental_patchpoint_void:
  181. case Intrinsic::experimental_patchpoint_i64:
  182. if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  183. return TTI::TCC_Free;
  184. break;
  185. case Intrinsic::experimental_gc_statepoint:
  186. if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  187. return TTI::TCC_Free;
  188. break;
  189. }
  190. return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  191. }
  192. TargetTransformInfo::PopcntSupportKind
  193. AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  194. assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  195. if (TyWidth == 32 || TyWidth == 64)
  196. return TTI::PSK_FastHardware;
  197. // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  198. return TTI::PSK_Software;
  199. }
  200. InstructionCost
  201. AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  202. TTI::TargetCostKind CostKind) {
  203. auto *RetTy = ICA.getReturnType();
  204. switch (ICA.getID()) {
  205. case Intrinsic::umin:
  206. case Intrinsic::umax:
  207. case Intrinsic::smin:
  208. case Intrinsic::smax: {
  209. static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
  210. MVT::v8i16, MVT::v2i32, MVT::v4i32};
  211. auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
  212. // v2i64 types get converted to cmp+bif, hence the cost of 2
  213. if (LT.second == MVT::v2i64)
  214. return LT.first * 2;
  215. if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
  216. return LT.first;
  217. break;
  218. }
  219. case Intrinsic::sadd_sat:
  220. case Intrinsic::ssub_sat:
  221. case Intrinsic::uadd_sat:
  222. case Intrinsic::usub_sat: {
  223. static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
  224. MVT::v8i16, MVT::v2i32, MVT::v4i32,
  225. MVT::v2i64};
  226. auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
  227. // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
  228. // need to extend the type, as it uses shr(qadd(shl, shl)).
  229. unsigned Instrs =
  230. LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
  231. if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
  232. return LT.first * Instrs;
  233. break;
  234. }
  235. case Intrinsic::abs: {
  236. static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
  237. MVT::v8i16, MVT::v2i32, MVT::v4i32,
  238. MVT::v2i64};
  239. auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
  240. if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
  241. return LT.first;
  242. break;
  243. }
  244. case Intrinsic::experimental_stepvector: {
  245. InstructionCost Cost = 1; // Cost of the `index' instruction
  246. auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
  247. // Legalisation of illegal vectors involves an `index' instruction plus
  248. // (LT.first - 1) vector adds.
  249. if (LT.first > 1) {
  250. Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
  251. InstructionCost AddCost =
  252. getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
  253. Cost += AddCost * (LT.first - 1);
  254. }
  255. return Cost;
  256. }
  257. case Intrinsic::bitreverse: {
  258. static const CostTblEntry BitreverseTbl[] = {
  259. {Intrinsic::bitreverse, MVT::i32, 1},
  260. {Intrinsic::bitreverse, MVT::i64, 1},
  261. {Intrinsic::bitreverse, MVT::v8i8, 1},
  262. {Intrinsic::bitreverse, MVT::v16i8, 1},
  263. {Intrinsic::bitreverse, MVT::v4i16, 2},
  264. {Intrinsic::bitreverse, MVT::v8i16, 2},
  265. {Intrinsic::bitreverse, MVT::v2i32, 2},
  266. {Intrinsic::bitreverse, MVT::v4i32, 2},
  267. {Intrinsic::bitreverse, MVT::v1i64, 2},
  268. {Intrinsic::bitreverse, MVT::v2i64, 2},
  269. };
  270. const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
  271. const auto *Entry =
  272. CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
  273. if (Entry) {
  274. // The cost model uses the legal type (i32) that i8 and i16 are promoted
  275. // to, plus 1 so that we match the actual lowering cost
  276. if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
  277. TLI->getValueType(DL, RetTy, true) == MVT::i16)
  278. return LegalisationCost.first * Entry->Cost + 1;
  279. return LegalisationCost.first * Entry->Cost;
  280. }
  281. break;
  282. }
  283. case Intrinsic::ctpop: {
  284. static const CostTblEntry CtpopCostTbl[] = {
  285. {ISD::CTPOP, MVT::v2i64, 4},
  286. {ISD::CTPOP, MVT::v4i32, 3},
  287. {ISD::CTPOP, MVT::v8i16, 2},
  288. {ISD::CTPOP, MVT::v16i8, 1},
  289. {ISD::CTPOP, MVT::i64, 4},
  290. {ISD::CTPOP, MVT::v2i32, 3},
  291. {ISD::CTPOP, MVT::v4i16, 2},
  292. {ISD::CTPOP, MVT::v8i8, 1},
  293. {ISD::CTPOP, MVT::i32, 5},
  294. };
  295. auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
  296. MVT MTy = LT.second;
  297. if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
  298. // Extra cost of +1 when illegal vector types are legalized by promoting
  299. // the integer type.
  300. int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
  301. RetTy->getScalarSizeInBits()
  302. ? 1
  303. : 0;
  304. return LT.first * Entry->Cost + ExtraCost;
  305. }
  306. break;
  307. }
  308. case Intrinsic::sadd_with_overflow:
  309. case Intrinsic::uadd_with_overflow:
  310. case Intrinsic::ssub_with_overflow:
  311. case Intrinsic::usub_with_overflow:
  312. case Intrinsic::smul_with_overflow:
  313. case Intrinsic::umul_with_overflow: {
  314. static const CostTblEntry WithOverflowCostTbl[] = {
  315. {Intrinsic::sadd_with_overflow, MVT::i8, 3},
  316. {Intrinsic::uadd_with_overflow, MVT::i8, 3},
  317. {Intrinsic::sadd_with_overflow, MVT::i16, 3},
  318. {Intrinsic::uadd_with_overflow, MVT::i16, 3},
  319. {Intrinsic::sadd_with_overflow, MVT::i32, 1},
  320. {Intrinsic::uadd_with_overflow, MVT::i32, 1},
  321. {Intrinsic::sadd_with_overflow, MVT::i64, 1},
  322. {Intrinsic::uadd_with_overflow, MVT::i64, 1},
  323. {Intrinsic::ssub_with_overflow, MVT::i8, 3},
  324. {Intrinsic::usub_with_overflow, MVT::i8, 3},
  325. {Intrinsic::ssub_with_overflow, MVT::i16, 3},
  326. {Intrinsic::usub_with_overflow, MVT::i16, 3},
  327. {Intrinsic::ssub_with_overflow, MVT::i32, 1},
  328. {Intrinsic::usub_with_overflow, MVT::i32, 1},
  329. {Intrinsic::ssub_with_overflow, MVT::i64, 1},
  330. {Intrinsic::usub_with_overflow, MVT::i64, 1},
  331. {Intrinsic::smul_with_overflow, MVT::i8, 5},
  332. {Intrinsic::umul_with_overflow, MVT::i8, 4},
  333. {Intrinsic::smul_with_overflow, MVT::i16, 5},
  334. {Intrinsic::umul_with_overflow, MVT::i16, 4},
  335. {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
  336. {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
  337. {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
  338. {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
  339. };
  340. EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
  341. if (MTy.isSimple())
  342. if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
  343. MTy.getSimpleVT()))
  344. return Entry->Cost;
  345. break;
  346. }
  347. default:
  348. break;
  349. }
  350. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  351. }
  352. /// Remove redundant reinterpret casts (to/from svbool) in the presence of
  353. /// control flow
  354. static Optional<Instruction *> processPhiNode(InstCombiner &IC,
  355. IntrinsicInst &II) {
  356. SmallVector<Instruction *, 32> Worklist;
  357. auto RequiredType = II.getType();
  358. auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  359. assert(PN && "Expected Phi Node!");
  360. // Don't create a new Phi unless we can remove the old one.
  361. if (!PN->hasOneUse())
  362. return None;
  363. for (Value *IncValPhi : PN->incoming_values()) {
  364. auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
  365. if (!Reinterpret ||
  366. Reinterpret->getIntrinsicID() !=
  367. Intrinsic::aarch64_sve_convert_to_svbool ||
  368. RequiredType != Reinterpret->getArgOperand(0)->getType())
  369. return None;
  370. }
  371. // Create the new Phi
  372. LLVMContext &Ctx = PN->getContext();
  373. IRBuilder<> Builder(Ctx);
  374. Builder.SetInsertPoint(PN);
  375. PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  376. Worklist.push_back(PN);
  377. for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
  378. auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
  379. NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
  380. Worklist.push_back(Reinterpret);
  381. }
  382. // Cleanup Phi Node and reinterprets
  383. return IC.replaceInstUsesWith(II, NPN);
  384. }
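// Rough IR sketch of the rewrite above (types are examples only):
//   %phi = phi <vscale x 16 x i1> [ %to_svbool_a, %bb1 ], [ %to_svbool_b, %bb2 ]
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// becomes a phi over the original <vscale x 4 x i1> operands of the
// to_svbool calls, and %res is replaced by that new phi.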
  385. // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
  386. // => (binop (pred) (from_svbool _) (from_svbool _))
  387. //
  388. // The above transformation eliminates a `to_svbool` in the predicate
  389. // operand of bitwise operation `binop` by narrowing the vector width of
  390. // the operation. For example, it would convert a `<vscale x 16 x i1>
  391. // and` into a `<vscale x 4 x i1> and`. This is profitable because
  392. // to_svbool must zero the new lanes during widening, whereas
  393. // from_svbool is free.
  394. static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
  395. IntrinsicInst &II) {
  396. auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  397. if (!BinOp)
  398. return None;
  399. auto IntrinsicID = BinOp->getIntrinsicID();
  400. switch (IntrinsicID) {
  401. case Intrinsic::aarch64_sve_and_z:
  402. case Intrinsic::aarch64_sve_bic_z:
  403. case Intrinsic::aarch64_sve_eor_z:
  404. case Intrinsic::aarch64_sve_nand_z:
  405. case Intrinsic::aarch64_sve_nor_z:
  406. case Intrinsic::aarch64_sve_orn_z:
  407. case Intrinsic::aarch64_sve_orr_z:
  408. break;
  409. default:
  410. return None;
  411. }
  412. auto BinOpPred = BinOp->getOperand(0);
  413. auto BinOpOp1 = BinOp->getOperand(1);
  414. auto BinOpOp2 = BinOp->getOperand(2);
  415. auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  416. if (!PredIntr ||
  417. PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
  418. return None;
  419. auto PredOp = PredIntr->getOperand(0);
  420. auto PredOpTy = cast<VectorType>(PredOp->getType());
  421. if (PredOpTy != II.getType())
  422. return None;
  423. IRBuilder<> Builder(II.getContext());
  424. Builder.SetInsertPoint(&II);
  425. SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  426. auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
  427. Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  428. NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  429. if (BinOpOp1 == BinOpOp2)
  430. NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  431. else
  432. NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
  433. Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
  434. auto NarrowedBinOp =
  435. Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  436. return IC.replaceInstUsesWith(II, NarrowedBinOp);
  437. }
  438. static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
  439. IntrinsicInst &II) {
  440. // If the reinterpret instruction operand is a PHI Node
  441. if (isa<PHINode>(II.getArgOperand(0)))
  442. return processPhiNode(IC, II);
  443. if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
  444. return BinOpCombine;
  445. SmallVector<Instruction *, 32> CandidatesForRemoval;
  446. Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
  447. const auto *IVTy = cast<VectorType>(II.getType());
  448. // Walk the chain of conversions.
  449. while (Cursor) {
  450. // If the type of the cursor has fewer lanes than the final result, zeroing
  451. // must take place, which breaks the equivalence chain.
  452. const auto *CursorVTy = cast<VectorType>(Cursor->getType());
  453. if (CursorVTy->getElementCount().getKnownMinValue() <
  454. IVTy->getElementCount().getKnownMinValue())
  455. break;
  456. // If the cursor has the same type as I, it is a viable replacement.
  457. if (Cursor->getType() == IVTy)
  458. EarliestReplacement = Cursor;
  459. auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
  460. // If this is not an SVE conversion intrinsic, this is the end of the chain.
  461. if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
  462. Intrinsic::aarch64_sve_convert_to_svbool ||
  463. IntrinsicCursor->getIntrinsicID() ==
  464. Intrinsic::aarch64_sve_convert_from_svbool))
  465. break;
  466. CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
  467. Cursor = IntrinsicCursor->getOperand(0);
  468. }
  469. // If no viable replacement in the conversion chain was found, there is
  470. // nothing to do.
  471. if (!EarliestReplacement)
  472. return None;
  473. return IC.replaceInstUsesWith(II, EarliestReplacement);
  474. }
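// Illustrative chain: if %p already has the result type, then
//   from_svbool(to_svbool(from_svbool(to_svbool(%p))))
// collapses to %p, because no step in the chain has fewer lanes than the
// final result and %p is the earliest value of the required type.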
  475. static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
  476. IntrinsicInst &II) {
  477. IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  478. if (!Pg)
  479. return None;
  480. if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  481. return None;
  482. const auto PTruePattern =
  483. cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  484. if (PTruePattern != AArch64SVEPredPattern::vl1)
  485. return None;
  486. // The intrinsic is inserting into lane zero so use an insert instead.
  487. auto *IdxTy = Type::getInt64Ty(II.getContext());
  488. auto *Insert = InsertElementInst::Create(
  489. II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  490. Insert->insertBefore(&II);
  491. Insert->takeName(&II);
  492. return IC.replaceInstUsesWith(II, Insert);
  493. }
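// Rough example: sve.dup(%passthru, ptrue(vl1), %x) only writes %x into
// lane 0 (inactive lanes keep %passthru), so it becomes
// "insertelement %passthru, %x, i64 0".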
  494. static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
  495. IntrinsicInst &II) {
  496. // Replace DupX with a regular IR splat.
  497. IRBuilder<> Builder(II.getContext());
  498. Builder.SetInsertPoint(&II);
  499. auto *RetTy = cast<ScalableVectorType>(II.getType());
  500. Value *Splat =
  501. Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
  502. Splat->takeName(&II);
  503. return IC.replaceInstUsesWith(II, Splat);
  504. }
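// e.g. sve.dup.x(i32 %x) becomes an ordinary IR splat of %x, which generic
// optimizations already understand.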
  505. static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
  506. IntrinsicInst &II) {
  507. LLVMContext &Ctx = II.getContext();
  508. IRBuilder<> Builder(Ctx);
  509. Builder.SetInsertPoint(&II);
  510. // Check that the predicate is all active
  511. auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  512. if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  513. return None;
  514. const auto PTruePattern =
  515. cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  516. if (PTruePattern != AArch64SVEPredPattern::all)
  517. return None;
  518. // Check that we have a compare of zero..
  519. auto *SplatValue =
  520. dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  521. if (!SplatValue || !SplatValue->isZero())
  522. return None;
  523. // ..against a dupq
  524. auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  525. if (!DupQLane ||
  526. DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
  527. return None;
  528. // Where the dupq is a lane 0 replicate of a vector insert
  529. if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
  530. return None;
  531. auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  532. if (!VecIns ||
  533. VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
  534. return None;
  535. // Where the vector insert is a fixed constant vector insert into undef at
  536. // index zero
  537. if (!isa<UndefValue>(VecIns->getArgOperand(0)))
  538. return None;
  539. if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
  540. return None;
  541. auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  542. if (!ConstVec)
  543. return None;
  544. auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  545. auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  546. if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
  547. return None;
  548. unsigned NumElts = VecTy->getNumElements();
  549. unsigned PredicateBits = 0;
  550. // Expand intrinsic operands to a 16-bit byte level predicate
  551. for (unsigned I = 0; I < NumElts; ++I) {
  552. auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
  553. if (!Arg)
  554. return None;
  555. if (!Arg->isZero())
  556. PredicateBits |= 1 << (I * (16 / NumElts));
  557. }
  558. // If all bits are zero bail early with an empty predicate
  559. if (PredicateBits == 0) {
  560. auto *PFalse = Constant::getNullValue(II.getType());
  561. PFalse->takeName(&II);
  562. return IC.replaceInstUsesWith(II, PFalse);
  563. }
  564. // Calculate largest predicate type used (where byte predicate is largest)
  565. unsigned Mask = 8;
  566. for (unsigned I = 0; I < 16; ++I)
  567. if ((PredicateBits & (1 << I)) != 0)
  568. Mask |= (I % 8);
  569. unsigned PredSize = Mask & -Mask;
  570. auto *PredType = ScalableVectorType::get(
  571. Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
  572. // Ensure all relevant bits are set
  573. for (unsigned I = 0; I < 16; I += PredSize)
  574. if ((PredicateBits & (1 << I)) == 0)
  575. return None;
  576. auto *PTruePat =
  577. ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  578. auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
  579. {PredType}, {PTruePat});
  580. auto *ConvertToSVBool = Builder.CreateIntrinsic(
  581. Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  582. auto *ConvertFromSVBool =
  583. Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
  584. {II.getType()}, {ConvertToSVBool});
  585. ConvertFromSVBool->takeName(&II);
  586. return IC.replaceInstUsesWith(II, ConvertFromSVBool);
  587. }
  588. static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
  589. IntrinsicInst &II) {
  590. IRBuilder<> Builder(II.getContext());
  591. Builder.SetInsertPoint(&II);
  592. Value *Pg = II.getArgOperand(0);
  593. Value *Vec = II.getArgOperand(1);
  594. auto IntrinsicID = II.getIntrinsicID();
  595. bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
  596. // lastX(splat(X)) --> X
  597. if (auto *SplatVal = getSplatValue(Vec))
  598. return IC.replaceInstUsesWith(II, SplatVal);
  599. // If x and/or y is a splat value then:
  600. // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  601. Value *LHS, *RHS;
  602. if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
  603. if (isSplatValue(LHS) || isSplatValue(RHS)) {
  604. auto *OldBinOp = cast<BinaryOperator>(Vec);
  605. auto OpC = OldBinOp->getOpcode();
  606. auto *NewLHS =
  607. Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
  608. auto *NewRHS =
  609. Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
  610. auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
  611. OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
  612. return IC.replaceInstUsesWith(II, NewBinOp);
  613. }
  614. }
  615. auto *C = dyn_cast<Constant>(Pg);
  616. if (IsAfter && C && C->isNullValue()) {
  617. // The intrinsic is extracting lane 0 so use an extract instead.
  618. auto *IdxTy = Type::getInt64Ty(II.getContext());
  619. auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
  620. Extract->insertBefore(&II);
  621. Extract->takeName(&II);
  622. return IC.replaceInstUsesWith(II, Extract);
  623. }
  624. auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  625. if (!IntrPG)
  626. return None;
  627. if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  628. return None;
  629. const auto PTruePattern =
  630. cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
  631. // Can the intrinsic's predicate be converted to a known constant index?
  632. unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  633. if (!MinNumElts)
  634. return None;
  635. unsigned Idx = MinNumElts - 1;
  636. // Increment the index if extracting the element after the last active
  637. // predicate element.
  638. if (IsAfter)
  639. ++Idx;
  640. // Ignore extracts whose index is larger than the known minimum vector
  641. // length. NOTE: This is an artificial constraint where we prefer to
  642. // maintain what the user asked for until an alternative is proven faster.
  643. auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  644. if (Idx >= PgVTy->getMinNumElements())
  645. return None;
  646. // The intrinsic is extracting a fixed lane so use an extract instead.
  647. auto *IdxTy = Type::getInt64Ty(II.getContext());
  648. auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  649. Extract->insertBefore(&II);
  650. Extract->takeName(&II);
  651. return IC.replaceInstUsesWith(II, Extract);
  652. }
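// Rough example: lastb(ptrue(vl4), %v) reads the last active element, so it
// becomes "extractelement %v, i64 3"; lasta with the same predicate would use
// index 4 (one past the last active element), provided that index is still
// below the known minimum vector length.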
  653. static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
  654. IntrinsicInst &II) {
  655. LLVMContext &Ctx = II.getContext();
  656. IRBuilder<> Builder(Ctx);
  657. Builder.SetInsertPoint(&II);
  658. // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
  659. // can work with RDFFR_PP for ptest elimination.
  660. auto *AllPat =
  661. ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  662. auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
  663. {II.getType()}, {AllPat});
  664. auto *RDFFR =
  665. Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
  666. RDFFR->takeName(&II);
  667. return IC.replaceInstUsesWith(II, RDFFR);
  668. }
  669. static Optional<Instruction *>
  670. instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  671. const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
  672. if (Pattern == AArch64SVEPredPattern::all) {
  673. LLVMContext &Ctx = II.getContext();
  674. IRBuilder<> Builder(Ctx);
  675. Builder.SetInsertPoint(&II);
  676. Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
  677. auto *VScale = Builder.CreateVScale(StepVal);
  678. VScale->takeName(&II);
  679. return IC.replaceInstUsesWith(II, VScale);
  680. }
  681. unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
  682. return MinNumElts && NumElts >= MinNumElts
  683. ? Optional<Instruction *>(IC.replaceInstUsesWith(
  684. II, ConstantInt::get(II.getType(), MinNumElts)))
  685. : None;
  686. }
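// For example, cntw(all) counts the 32-bit elements of a full vector and is
// rewritten to vscale * 4, while cntw(vl2) becomes the constant 2 because a
// legal SVE vector is guaranteed to hold at least that many 32-bit elements.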
  687. static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
  688. IntrinsicInst &II) {
  689. IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  690. IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  691. if (Op1 && Op2 &&
  692. Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
  693. Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
  694. Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
  695. IRBuilder<> Builder(II.getContext());
  696. Builder.SetInsertPoint(&II);
  697. Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
  698. Type *Tys[] = {Op1->getArgOperand(0)->getType()};
  699. auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
  700. PTest->takeName(&II);
  701. return IC.replaceInstUsesWith(II, PTest);
  702. }
  703. return None;
  704. }
  705. static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
  706. IntrinsicInst &II) {
  707. // fold (fadd p a (fmul p b c)) -> (fma p a b c)
  708. Value *P = II.getOperand(0);
  709. Value *A = II.getOperand(1);
  710. auto FMul = II.getOperand(2);
  711. Value *B, *C;
  712. if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
  713. m_Specific(P), m_Value(B), m_Value(C))))
  714. return None;
  715. if (!FMul->hasOneUse())
  716. return None;
  717. llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
  718. // Stop the combine when the flags on the inputs differ in case dropping flags
  719. // would lead to us missing out on more beneficial optimizations.
  720. if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
  721. return None;
  722. if (!FAddFlags.allowContract())
  723. return None;
  724. IRBuilder<> Builder(II.getContext());
  725. Builder.SetInsertPoint(&II);
  726. auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
  727. {II.getType()}, {P, A, B, C}, &II);
  728. FMLA->setFastMathFlags(FAddFlags);
  729. return IC.replaceInstUsesWith(II, FMLA);
  730. }
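// Rough IR sketch of the fold above, assuming matching flags that allow
// contraction and a single-use fmul:
//   %m = call ... @llvm.aarch64.sve.fmul(%pg, %b, %c)
//   %r = call ... @llvm.aarch64.sve.fadd(%pg, %a, %m)
// becomes
//   %r = call ... @llvm.aarch64.sve.fmla(%pg, %a, %b, %c)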
  731. static bool isAllActivePredicate(Value *Pred) {
  732. // Look through convert.from.svbool(convert.to.svbool(...) chain.
  733. Value *UncastedPred;
  734. if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
  735. m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
  736. m_Value(UncastedPred)))))
  737. // If the predicate has the same number of lanes as, or fewer than, the
  738. // uncasted predicate then we know the casting has no effect.
  739. if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
  740. cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
  741. Pred = UncastedPred;
  742. return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
  743. m_ConstantInt<AArch64SVEPredPattern::all>()));
  744. }
  745. static Optional<Instruction *>
  746. instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  747. IRBuilder<> Builder(II.getContext());
  748. Builder.SetInsertPoint(&II);
  749. Value *Pred = II.getOperand(0);
  750. Value *PtrOp = II.getOperand(1);
  751. Type *VecTy = II.getType();
  752. Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
  753. if (isAllActivePredicate(Pred)) {
  754. LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
  755. return IC.replaceInstUsesWith(II, Load);
  756. }
  757. CallInst *MaskedLoad =
  758. Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
  759. Pred, ConstantAggregateZero::get(VecTy));
  760. return IC.replaceInstUsesWith(II, MaskedLoad);
  761. }
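// Rough example: sve.ld1(ptrue(all), %ptr) becomes a plain vector load from
// %ptr (bitcast to a vector pointer); with any other predicate it becomes a
// llvm.masked.load with a zeroinitializer passthru instead.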
  762. static Optional<Instruction *>
  763. instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  764. IRBuilder<> Builder(II.getContext());
  765. Builder.SetInsertPoint(&II);
  766. Value *VecOp = II.getOperand(0);
  767. Value *Pred = II.getOperand(1);
  768. Value *PtrOp = II.getOperand(2);
  769. Value *VecPtr =
  770. Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
  771. if (isAllActivePredicate(Pred)) {
  772. Builder.CreateStore(VecOp, VecPtr);
  773. return IC.eraseInstFromFunction(II);
  774. }
  775. Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL),
  776. Pred);
  777. return IC.eraseInstFromFunction(II);
  778. }
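// Likewise for stores: sve.st1(%v, ptrue(all), %ptr) becomes a plain store of
// %v, and any other predicate turns it into a llvm.masked.store.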
  779. static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
  780. switch (Intrinsic) {
  781. case Intrinsic::aarch64_sve_fmul:
  782. return Instruction::BinaryOps::FMul;
  783. case Intrinsic::aarch64_sve_fadd:
  784. return Instruction::BinaryOps::FAdd;
  785. case Intrinsic::aarch64_sve_fsub:
  786. return Instruction::BinaryOps::FSub;
  787. default:
  788. return Instruction::BinaryOpsEnd;
  789. }
  790. }
  791. static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
  792. IntrinsicInst &II) {
  793. auto *OpPredicate = II.getOperand(0);
  794. auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
  795. if (BinOpCode == Instruction::BinaryOpsEnd ||
  796. !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
  797. m_ConstantInt<AArch64SVEPredPattern::all>())))
  798. return None;
  799. IRBuilder<> Builder(II.getContext());
  800. Builder.SetInsertPoint(&II);
  801. Builder.setFastMathFlags(II.getFastMathFlags());
  802. auto BinOp =
  803. Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
  804. return IC.replaceInstUsesWith(II, BinOp);
  805. }
  806. static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
  807. IntrinsicInst &II) {
  808. if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
  809. return FMLA;
  810. return instCombineSVEVectorBinOp(IC, II);
  811. }
  812. static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
  813. IntrinsicInst &II) {
  814. auto *OpPredicate = II.getOperand(0);
  815. auto *OpMultiplicand = II.getOperand(1);
  816. auto *OpMultiplier = II.getOperand(2);
  817. IRBuilder<> Builder(II.getContext());
  818. Builder.SetInsertPoint(&II);
  819. // Return true if a given instruction is a unit splat value, false otherwise.
  820. auto IsUnitSplat = [](auto *I) {
  821. auto *SplatValue = getSplatValue(I);
  822. if (!SplatValue)
  823. return false;
  824. return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  825. };
  826. // Return true if a given instruction is an aarch64_sve_dup intrinsic call
  827. // with a unit splat value, false otherwise.
  828. auto IsUnitDup = [](auto *I) {
  829. auto *IntrI = dyn_cast<IntrinsicInst>(I);
  830. if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
  831. return false;
  832. auto *SplatValue = IntrI->getOperand(2);
  833. return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  834. };
  835. if (IsUnitSplat(OpMultiplier)) {
  836. // [f]mul pg %n, (dupx 1) => %n
  837. OpMultiplicand->takeName(&II);
  838. return IC.replaceInstUsesWith(II, OpMultiplicand);
  839. } else if (IsUnitDup(OpMultiplier)) {
  840. // [f]mul pg %n, (dup pg 1) => %n
  841. auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
  842. auto *DupPg = DupInst->getOperand(1);
  843. // TODO: this is naive. The optimization is still valid if DupPg
  844. // 'encompasses' OpPredicate, not only if they're the same predicate.
  845. if (OpPredicate == DupPg) {
  846. OpMultiplicand->takeName(&II);
  847. return IC.replaceInstUsesWith(II, OpMultiplicand);
  848. }
  849. }
  850. return instCombineSVEVectorBinOp(IC, II);
  851. }
  852. static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
  853. IntrinsicInst &II) {
  854. IRBuilder<> Builder(II.getContext());
  855. Builder.SetInsertPoint(&II);
  856. Value *UnpackArg = II.getArgOperand(0);
  857. auto *RetTy = cast<ScalableVectorType>(II.getType());
  858. bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
  859. II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
  860. // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
  861. // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
  862. if (auto *ScalarArg = getSplatValue(UnpackArg)) {
  863. ScalarArg =
  864. Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
  865. Value *NewVal =
  866. Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
  867. NewVal->takeName(&II);
  868. return IC.replaceInstUsesWith(II, NewVal);
  869. }
  870. return None;
  871. }
  872. static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
  873. IntrinsicInst &II) {
  874. auto *OpVal = II.getOperand(0);
  875. auto *OpIndices = II.getOperand(1);
  876. VectorType *VTy = cast<VectorType>(II.getType());
  877. // Check whether OpIndices is a constant splat value < minimal element count
  878. // of result.
  879. auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
  880. if (!SplatValue ||
  881. SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
  882. return None;
  883. // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
  884. // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
  885. IRBuilder<> Builder(II.getContext());
  886. Builder.SetInsertPoint(&II);
  887. auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
  888. auto *VectorSplat =
  889. Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
  890. VectorSplat->takeName(&II);
  891. return IC.replaceInstUsesWith(II, VectorSplat);
  892. }
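// Rough example: sve.tbl(%v, splat(2)) selects element 2 for every lane, so
// it is rewritten as splat(extractelement %v, 2), provided the splatted index
// is below the known minimum element count of the result.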
  893. static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
  894. IntrinsicInst &II) {
  895. // Try to remove sequences of tuple get/set.
  896. Value *SetTuple, *SetIndex, *SetValue;
  897. auto *GetTuple = II.getArgOperand(0);
  898. auto *GetIndex = II.getArgOperand(1);
  899. // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
  900. // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
  901. // Make sure that the types of the current intrinsic and SetValue match
  902. // in order to safely remove the sequence.
  903. if (!match(GetTuple,
  904. m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
  905. m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
  906. SetValue->getType() != II.getType())
  907. return None;
  908. // Case where we get the same index right after setting it.
  909. // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
  910. if (GetIndex == SetIndex)
  911. return IC.replaceInstUsesWith(II, SetValue);
  912. // If we are getting a different index than the one set by the tuple_set
  913. // intrinsic, we can just replace the input tuple with the one up the chain.
  914. // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
  915. // --> tuple_get(SetTuple, GetIndex)
  916. return IC.replaceOperand(II, 0, SetTuple);
  917. }
  918. static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
  919. IntrinsicInst &II) {
  920. // zip1(uzp1(A, B), uzp2(A, B)) --> A
  921. // zip2(uzp1(A, B), uzp2(A, B)) --> B
  922. Value *A, *B;
  923. if (match(II.getArgOperand(0),
  924. m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
  925. match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
  926. m_Specific(A), m_Specific(B))))
  927. return IC.replaceInstUsesWith(
  928. II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
  929. return None;
  930. }
  931. static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
  932. IntrinsicInst &II) {
  933. Value *Mask = II.getOperand(0);
  934. Value *BasePtr = II.getOperand(1);
  935. Value *Index = II.getOperand(2);
  936. Type *Ty = II.getType();
  937. Type *BasePtrTy = BasePtr->getType();
  938. Value *PassThru = ConstantAggregateZero::get(Ty);
  939. // Contiguous gather => masked load.
  940. // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  941. // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  942. Value *IndexBase;
  943. if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
  944. m_Value(IndexBase), m_SpecificInt(1)))) {
  945. IRBuilder<> Builder(II.getContext());
  946. Builder.SetInsertPoint(&II);
  947. Align Alignment =
  948. BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
  949. Type *VecPtrTy = PointerType::getUnqual(Ty);
  950. Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
  951. IndexBase);
  952. Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
  953. CallInst *MaskedLoad =
  954. Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
  955. MaskedLoad->takeName(&II);
  956. return IC.replaceInstUsesWith(II, MaskedLoad);
  957. }
  958. return None;
  959. }
  960. static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
  961. IntrinsicInst &II) {
  962. Value *Val = II.getOperand(0);
  963. Value *Mask = II.getOperand(1);
  964. Value *BasePtr = II.getOperand(2);
  965. Value *Index = II.getOperand(3);
  966. Type *Ty = Val->getType();
  967. Type *BasePtrTy = BasePtr->getType();
  968. // Contiguous scatter => masked store.
  969. // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  970. // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
  971. Value *IndexBase;
  972. if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
  973. m_Value(IndexBase), m_SpecificInt(1)))) {
  974. IRBuilder<> Builder(II.getContext());
  975. Builder.SetInsertPoint(&II);
  976. Align Alignment =
  977. BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
  978. Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
  979. IndexBase);
  980. Type *VecPtrTy = PointerType::getUnqual(Ty);
  981. Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
  982. (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
  983. return IC.eraseInstFromFunction(II);
  984. }
  985. return None;
  986. }
  987. static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
  988. IntrinsicInst &II) {
  989. IRBuilder<> Builder(II.getContext());
  990. Builder.SetInsertPoint(&II);
  991. Type *Int32Ty = Builder.getInt32Ty();
  992. Value *Pred = II.getOperand(0);
  993. Value *Vec = II.getOperand(1);
  994. Value *DivVec = II.getOperand(2);
  995. Value *SplatValue = getSplatValue(DivVec);
  996. ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  997. if (!SplatConstantInt)
  998. return None;
  999. APInt Divisor = SplatConstantInt->getValue();
  1000. if (Divisor.isPowerOf2()) {
  1001. Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
  1002. auto ASRD = Builder.CreateIntrinsic(
  1003. Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
  1004. return IC.replaceInstUsesWith(II, ASRD);
  1005. }
  1006. if (Divisor.isNegatedPowerOf2()) {
  1007. Divisor.negate();
  1008. Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
  1009. auto ASRD = Builder.CreateIntrinsic(
  1010. Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
  1011. auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
  1012. {ASRD->getType()}, {ASRD, Pred, ASRD});
  1013. return IC.replaceInstUsesWith(II, NEG);
  1014. }
  1015. return None;
  1016. }
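// Rough example: sdiv(%pg, %v, splat(8)) becomes sve.asrd(%pg, %v, 3), and a
// splatted divisor of -8 becomes the same asrd followed by a predicated
// sve.neg of the result.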
  1017. Optional<Instruction *>
  1018. AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
  1019. IntrinsicInst &II) const {
  1020. Intrinsic::ID IID = II.getIntrinsicID();
  1021. switch (IID) {
  1022. default:
  1023. break;
  1024. case Intrinsic::aarch64_sve_convert_from_svbool:
  1025. return instCombineConvertFromSVBool(IC, II);
  1026. case Intrinsic::aarch64_sve_dup:
  1027. return instCombineSVEDup(IC, II);
  1028. case Intrinsic::aarch64_sve_dup_x:
  1029. return instCombineSVEDupX(IC, II);
  1030. case Intrinsic::aarch64_sve_cmpne:
  1031. case Intrinsic::aarch64_sve_cmpne_wide:
  1032. return instCombineSVECmpNE(IC, II);
  1033. case Intrinsic::aarch64_sve_rdffr:
  1034. return instCombineRDFFR(IC, II);
  1035. case Intrinsic::aarch64_sve_lasta:
  1036. case Intrinsic::aarch64_sve_lastb:
  1037. return instCombineSVELast(IC, II);
  1038. case Intrinsic::aarch64_sve_cntd:
  1039. return instCombineSVECntElts(IC, II, 2);
  1040. case Intrinsic::aarch64_sve_cntw:
  1041. return instCombineSVECntElts(IC, II, 4);
  1042. case Intrinsic::aarch64_sve_cnth:
  1043. return instCombineSVECntElts(IC, II, 8);
  1044. case Intrinsic::aarch64_sve_cntb:
  1045. return instCombineSVECntElts(IC, II, 16);
  1046. case Intrinsic::aarch64_sve_ptest_any:
  1047. case Intrinsic::aarch64_sve_ptest_first:
  1048. case Intrinsic::aarch64_sve_ptest_last:
  1049. return instCombineSVEPTest(IC, II);
  1050. case Intrinsic::aarch64_sve_mul:
  1051. case Intrinsic::aarch64_sve_fmul:
  1052. return instCombineSVEVectorMul(IC, II);
  1053. case Intrinsic::aarch64_sve_fadd:
  1054. return instCombineSVEVectorFAdd(IC, II);
  1055. case Intrinsic::aarch64_sve_fsub:
  1056. return instCombineSVEVectorBinOp(IC, II);
  1057. case Intrinsic::aarch64_sve_tbl:
  1058. return instCombineSVETBL(IC, II);
  1059. case Intrinsic::aarch64_sve_uunpkhi:
  1060. case Intrinsic::aarch64_sve_uunpklo:
  1061. case Intrinsic::aarch64_sve_sunpkhi:
  1062. case Intrinsic::aarch64_sve_sunpklo:
  1063. return instCombineSVEUnpack(IC, II);
  1064. case Intrinsic::aarch64_sve_tuple_get:
  1065. return instCombineSVETupleGet(IC, II);
  1066. case Intrinsic::aarch64_sve_zip1:
  1067. case Intrinsic::aarch64_sve_zip2:
  1068. return instCombineSVEZip(IC, II);
  1069. case Intrinsic::aarch64_sve_ld1_gather_index:
  1070. return instCombineLD1GatherIndex(IC, II);
  1071. case Intrinsic::aarch64_sve_st1_scatter_index:
  1072. return instCombineST1ScatterIndex(IC, II);
  1073. case Intrinsic::aarch64_sve_ld1:
  1074. return instCombineSVELD1(IC, II, DL);
  1075. case Intrinsic::aarch64_sve_st1:
  1076. return instCombineSVEST1(IC, II, DL);
  1077. case Intrinsic::aarch64_sve_sdiv:
  1078. return instCombineSVESDIV(IC, II);
  1079. }
  1080. return None;
  1081. }
  1082. Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
  1083. InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
  1084. APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
  1085. std::function<void(Instruction *, unsigned, APInt, APInt &)>
  1086. SimplifyAndSetOp) const {
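// The NEON narrowing/saturating intrinsics handled below operate lane-wise
// on their first operand, so the demanded elements of the result can be
// propagated straight through to operand 0.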
  1087. switch (II.getIntrinsicID()) {
  1088. default:
  1089. break;
  1090. case Intrinsic::aarch64_neon_fcvtxn:
  1091. case Intrinsic::aarch64_neon_rshrn:
  1092. case Intrinsic::aarch64_neon_sqrshrn:
  1093. case Intrinsic::aarch64_neon_sqrshrun:
  1094. case Intrinsic::aarch64_neon_sqshrn:
  1095. case Intrinsic::aarch64_neon_sqshrun:
  1096. case Intrinsic::aarch64_neon_sqxtn:
  1097. case Intrinsic::aarch64_neon_sqxtun:
  1098. case Intrinsic::aarch64_neon_uqrshrn:
  1099. case Intrinsic::aarch64_neon_uqshrn:
  1100. case Intrinsic::aarch64_neon_uqxtn:
  1101. SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
  1102. break;
  1103. }
  1104. return None;
  1105. }
  1106. bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
  1107. ArrayRef<const Value *> Args) {
1108. // A helper that returns a vector type from the given type. The number of
1109. // elements in type Ty determines the vector width.
  1110. auto toVectorTy = [&](Type *ArgTy) {
  1111. return VectorType::get(ArgTy->getScalarType(),
  1112. cast<VectorType>(DstTy)->getElementCount());
  1113. };
1114. // Exit early if DstTy is not a vector type whose elements are at least
1115. // 16 bits wide.
  1116. if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
  1117. return false;
  1118. // Determine if the operation has a widening variant. We consider both the
  1119. // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  1120. // instructions.
  1121. //
  1122. // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  1123. // verify that their extending operands are eliminated during code
  1124. // generation.
  1125. switch (Opcode) {
  1126. case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  1127. case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
  1128. break;
  1129. default:
  1130. return false;
  1131. }
  1132. // To be a widening instruction (either the "wide" or "long" versions), the
  1133. // second operand must be a sign- or zero extend having a single user. We
  1134. // only consider extends having a single user because they may otherwise not
  1135. // be eliminated.
  1136. if (Args.size() != 2 ||
  1137. (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
  1138. !Args[1]->hasOneUse())
  1139. return false;
  1140. auto *Extend = cast<CastInst>(Args[1]);
  1141. // Legalize the destination type and ensure it can be used in a widening
  1142. // operation.
  1143. auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  1144. unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  1145. if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
  1146. return false;
  1147. // Legalize the source type and ensure it can be used in a widening
  1148. // operation.
  1149. auto *SrcTy = toVectorTy(Extend->getSrcTy());
  1150. auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  1151. unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  1152. if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
  1153. return false;
  1154. // Get the total number of vector elements in the legalized types.
  1155. InstructionCost NumDstEls =
  1156. DstTyL.first * DstTyL.second.getVectorMinNumElements();
  1157. InstructionCost NumSrcEls =
  1158. SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
  1159. // Return true if the legalized types have the same number of vector elements
  1160. // and the destination element type size is twice that of the source type.
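// For example, add <8 x i16> %a, (sext <8 x i8> %b) legalizes to v8i16 and
// v8i8: both have eight elements and 16 == 2 * 8, so the extend can fold
// into a saddw/saddl.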
  1161. return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
  1162. }
  1163. InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  1164. Type *Src,
  1165. TTI::CastContextHint CCH,
  1166. TTI::TargetCostKind CostKind,
  1167. const Instruction *I) {
  1168. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  1169. assert(ISD && "Invalid opcode");
  1170. // If the cast is observable, and it is used by a widening instruction (e.g.,
  1171. // uaddl, saddw, etc.), it may be free.
  1172. if (I && I->hasOneUse()) {
  1173. auto *SingleUser = cast<Instruction>(*I->user_begin());
  1174. SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
  1175. if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
  1176. // If the cast is the second operand, it is free. We will generate either
  1177. // a "wide" or "long" version of the widening instruction.
  1178. if (I == SingleUser->getOperand(1))
  1179. return 0;
  1180. // If the cast is not the second operand, it will be free if it looks the
  1181. // same as the second operand. In this case, we will generate a "long"
  1182. // version of the widening instruction.
  1183. if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
  1184. if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
  1185. cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
  1186. return 0;
  1187. }
  1188. }
  1189. // TODO: Allow non-throughput costs that aren't binary.
  1190. auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
  1191. if (CostKind != TTI::TCK_RecipThroughput)
  1192. return Cost == 0 ? 0 : 1;
  1193. return Cost;
  1194. };
  1195. EVT SrcTy = TLI->getValueType(DL, Src);
  1196. EVT DstTy = TLI->getValueType(DL, Dst);
  1197. if (!SrcTy.isSimple() || !DstTy.isSimple())
  1198. return AdjustCost(
  1199. BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
  1200. static const TypeConversionCostTblEntry
  1201. ConversionTbl[] = {
  1202. { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
  1203. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
  1204. { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
  1205. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
  1206. // Truncations on nxvmiN
  1207. { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
  1208. { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
  1209. { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
  1210. { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
  1211. { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
  1212. { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
  1213. { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
  1214. { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
  1215. { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
  1216. { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
  1217. { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
  1218. { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
  1219. { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
  1220. { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
  1221. { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
  1222. { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
  1223. // The number of shll instructions for the extension.
  1224. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
  1225. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
  1226. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  1227. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  1228. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
  1229. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
  1230. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  1231. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  1232. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
  1233. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
  1234. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
  1235. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
  1236. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  1237. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  1238. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  1239. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  1240. // LowerVectorINT_TO_FP:
  1241. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  1242. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  1243. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  1244. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  1245. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  1246. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  1247. // Complex: to v2f32
  1248. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
  1249. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
  1250. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
  1251. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
  1252. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
  1253. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
  1254. // Complex: to v4f32
  1255. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
  1256. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  1257. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
  1258. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  1259. // Complex: to v8f32
  1260. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
  1261. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  1262. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
  1263. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  1264. // Complex: to v16f32
  1265. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
  1266. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
  1267. // Complex: to v2f64
  1268. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
  1269. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
  1270. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  1271. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
  1272. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
  1273. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  1274. // LowerVectorFP_TO_INT
  1275. { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
  1276. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
  1277. { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
  1278. { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
  1279. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  1280. { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
  1281. // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
  1282. { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
  1283. { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
  1284. { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
  1285. { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
  1286. { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
  1287. { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
  1288. // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
  1289. { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
  1290. { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
  1291. { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
  1292. { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
  1293. // Complex, from nxv2f32.
  1294. { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
  1295. { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
  1296. { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
  1297. { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
  1298. { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
  1299. { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
  1300. { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
  1301. { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
  1302. // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
  1303. { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
  1304. { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
  1305. { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
  1306. { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
  1307. { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
  1308. { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
  1309. // Complex, from nxv2f64.
  1310. { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
  1311. { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
  1312. { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
  1313. { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
  1314. { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
  1315. { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
  1316. { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
  1317. { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
  1318. // Complex, from nxv4f32.
  1319. { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
  1320. { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
  1321. { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
  1322. { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
  1323. { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
  1324. { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
  1325. { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
  1326. { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
  1327. // Complex, from nxv8f64. Illegal -> illegal conversions not required.
  1328. { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
  1329. { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
  1330. { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
  1331. { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
  1332. // Complex, from nxv4f64. Illegal -> illegal conversions not required.
  1333. { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
  1334. { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
  1335. { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
  1336. { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
  1337. { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
  1338. { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
  1339. // Complex, from nxv8f32. Illegal -> illegal conversions not required.
  1340. { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
  1341. { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
  1342. { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
  1343. { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
  1344. // Complex, from nxv8f16.
  1345. { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
  1346. { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
  1347. { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
  1348. { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
  1349. { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
  1350. { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
  1351. { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
  1352. { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
  1353. // Complex, from nxv4f16.
  1354. { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
  1355. { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
  1356. { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
  1357. { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
  1358. { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
  1359. { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
  1360. { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
  1361. { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
  1362. // Complex, from nxv2f16.
  1363. { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
  1364. { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
  1365. { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
  1366. { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
  1367. { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
  1368. { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
  1369. { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
  1370. { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
  1371. // Truncate from nxvmf32 to nxvmf16.
  1372. { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
  1373. { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
  1374. { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
  1375. // Truncate from nxvmf64 to nxvmf16.
  1376. { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
  1377. { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
  1378. { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
  1379. // Truncate from nxvmf64 to nxvmf32.
  1380. { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
  1381. { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
  1382. { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
  1383. // Extend from nxvmf16 to nxvmf32.
  1384. { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
  1385. { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
  1386. { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
  1387. // Extend from nxvmf16 to nxvmf64.
  1388. { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
  1389. { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
  1390. { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
  1391. // Extend from nxvmf32 to nxvmf64.
  1392. { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
  1393. { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
  1394. { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
  1395. // Bitcasts from float to integer
  1396. { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
  1397. { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
  1398. { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
  1399. // Bitcasts from integer to float
  1400. { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
  1401. { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
  1402. { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
  1403. };
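// Prefer the hand-tuned entries above; any (ISD, Dst, Src) combination not
// listed falls through to the generic cast cost below.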
  1404. if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
  1405. DstTy.getSimpleVT(),
  1406. SrcTy.getSimpleVT()))
  1407. return AdjustCost(Entry->Cost);
  1408. return AdjustCost(
  1409. BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
  1410. }
  1411. InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
  1412. Type *Dst,
  1413. VectorType *VecTy,
  1414. unsigned Index) {
  1415. // Make sure we were given a valid extend opcode.
  1416. assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
  1417. "Invalid opcode");
  1418. // We are extending an element we extract from a vector, so the source type
  1419. // of the extend is the element type of the vector.
  1420. auto *Src = VecTy->getElementType();
  1421. // Sign- and zero-extends are for integer types only.
  1422. assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
  1423. // Get the cost for the extract. We compute the cost (if any) for the extend
  1424. // below.
  1425. InstructionCost Cost =
  1426. getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
  1427. // Legalize the types.
  1428. auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  1429. auto DstVT = TLI->getValueType(DL, Dst);
  1430. auto SrcVT = TLI->getValueType(DL, Src);
  1431. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  1432. // If the resulting type is still a vector and the destination type is legal,
  1433. // we may get the extension for free. If not, get the default cost for the
  1434. // extend.
  1435. if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
  1436. return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
  1437. CostKind);
  1438. // The destination type should be larger than the element type. If not, get
  1439. // the default cost for the extend.
  1440. if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
  1441. return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
  1442. CostKind);
  1443. switch (Opcode) {
  1444. default:
  1445. llvm_unreachable("Opcode should be either SExt or ZExt");
  1446. // For sign-extends, we only need a smov, which performs the extension
  1447. // automatically.
  1448. case Instruction::SExt:
  1449. return Cost;
  1450. // For zero-extends, the extend is performed automatically by a umov unless
  1451. // the destination type is i64 and the element type is i8 or i16.
  1452. case Instruction::ZExt:
  1453. if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
  1454. return Cost;
  1455. }
  1456. // If we are unable to perform the extend for free, get the default cost.
  1457. return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
  1458. CostKind);
  1459. }
  1460. InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
  1461. TTI::TargetCostKind CostKind,
  1462. const Instruction *I) {
  1463. if (CostKind != TTI::TCK_RecipThroughput)
  1464. return Opcode == Instruction::PHI ? 0 : 1;
  1465. assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
  1466. // Branches are assumed to be predicted.
  1467. return 0;
  1468. }
  1469. InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
  1470. unsigned Index) {
  1471. assert(Val->isVectorTy() && "This must be a vector type");
  1472. if (Index != -1U) {
  1473. // Legalize the type.
  1474. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
  1475. // This type is legalized to a scalar type.
  1476. if (!LT.second.isVector())
  1477. return 0;
  1478. // The type may be split. For fixed-width vectors we can normalize the
  1479. // index to the new type.
  1480. if (LT.second.isFixedLengthVector()) {
  1481. unsigned Width = LT.second.getVectorNumElements();
  1482. Index = Index % Width;
  1483. }
  1484. // The element at index zero is already inside the vector.
  1485. if (Index == 0)
  1486. return 0;
  1487. }
  1488. // All other insert/extracts cost this much.
  1489. return ST->getVectorInsertExtractBaseCost();
  1490. }
  1491. InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
  1492. unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
  1493. TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
  1494. TTI::OperandValueProperties Opd1PropInfo,
  1495. TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
  1496. const Instruction *CxtI) {
  1497. // TODO: Handle more cost kinds.
  1498. if (CostKind != TTI::TCK_RecipThroughput)
  1499. return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
  1500. Opd2Info, Opd1PropInfo,
  1501. Opd2PropInfo, Args, CxtI);
  1502. // Legalize the type.
  1503. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  1504. // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  1505. // add in the widening overhead specified by the sub-target. Since the
  1506. // extends feeding widening instructions are performed automatically, they
  1507. // aren't present in the generated code and have a zero cost. By adding a
  1508. // widening overhead here, we attach the total cost of the combined operation
  1509. // to the widening instruction.
  1510. InstructionCost Cost = 0;
  1511. if (isWideningInstruction(Ty, Opcode, Args))
  1512. Cost += ST->getWideningBaseCost();
  1513. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  1514. switch (ISD) {
  1515. default:
  1516. return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
  1517. Opd2Info,
  1518. Opd1PropInfo, Opd2PropInfo);
  1519. case ISD::SDIV:
  1520. if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
  1521. Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1522. // On AArch64, scalar signed division by a power-of-two constant is
1523. // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1524. // The OperandValue properties may not be the same as those of the
1525. // previous operation; conservatively assume OP_None.
  1526. Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
  1527. Opd1Info, Opd2Info,
  1528. TargetTransformInfo::OP_None,
  1529. TargetTransformInfo::OP_None);
  1530. Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
  1531. Opd1Info, Opd2Info,
  1532. TargetTransformInfo::OP_None,
  1533. TargetTransformInfo::OP_None);
  1534. Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
  1535. Opd1Info, Opd2Info,
  1536. TargetTransformInfo::OP_None,
  1537. TargetTransformInfo::OP_None);
  1538. Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
  1539. Opd1Info, Opd2Info,
  1540. TargetTransformInfo::OP_None,
  1541. TargetTransformInfo::OP_None);
  1542. return Cost;
  1543. }
  1544. LLVM_FALLTHROUGH;
  1545. case ISD::UDIV:
  1546. if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
  1547. auto VT = TLI->getValueType(DL, Ty);
  1548. if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1549. // Vector signed division by a constant is expanded to the
1550. // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1551. // to MULHU + SUB + SRL + ADD + SRL.
  1552. InstructionCost MulCost = getArithmeticInstrCost(
  1553. Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
  1554. TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
  1555. InstructionCost AddCost = getArithmeticInstrCost(
  1556. Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
  1557. TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
  1558. InstructionCost ShrCost = getArithmeticInstrCost(
  1559. Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
  1560. TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
  1561. return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
  1562. }
  1563. }
  1564. Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
  1565. Opd2Info,
  1566. Opd1PropInfo, Opd2PropInfo);
  1567. if (Ty->isVectorTy()) {
  1568. // On AArch64, vector divisions are not supported natively and are
  1569. // expanded into scalar divisions of each pair of elements.
  1570. Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
  1571. Opd1Info, Opd2Info, Opd1PropInfo,
  1572. Opd2PropInfo);
  1573. Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
  1574. Opd1Info, Opd2Info, Opd1PropInfo,
  1575. Opd2PropInfo);
  1576. // TODO: if one of the arguments is scalar, then it's not necessary to
  1577. // double the cost of handling the vector elements.
  1578. Cost += Cost;
  1579. }
  1580. return Cost;
  1581. case ISD::MUL:
  1582. if (LT.second != MVT::v2i64)
  1583. return (Cost + 1) * LT.first;
  1584. // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
  1585. // as elements are extracted from the vectors and the muls scalarized.
  1586. // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1587. // cost for an i64 vector directly here, which is:
  1588. // - four i64 extracts,
  1589. // - two i64 inserts, and
  1590. // - two muls.
1591. // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
  1592. // LT.first = 2 the cost is 16.
  1593. return LT.first * 8;
  1594. case ISD::ADD:
  1595. case ISD::XOR:
  1596. case ISD::OR:
  1597. case ISD::AND:
  1598. // These nodes are marked as 'custom' for combining purposes only.
  1599. // We know that they are legal. See LowerAdd in ISelLowering.
  1600. return (Cost + 1) * LT.first;
  1601. case ISD::FADD:
  1602. case ISD::FSUB:
  1603. case ISD::FMUL:
  1604. case ISD::FDIV:
  1605. case ISD::FNEG:
  1606. // These nodes are marked as 'custom' just to lower them to SVE.
  1607. // We know said lowering will incur no additional cost.
  1608. if (!Ty->getScalarType()->isFP128Ty())
  1609. return (Cost + 2) * LT.first;
  1610. return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
  1611. Opd2Info,
  1612. Opd1PropInfo, Opd2PropInfo);
  1613. }
  1614. }
  1615. InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
  1616. ScalarEvolution *SE,
  1617. const SCEV *Ptr) {
  1618. // Address computations in vectorized code with non-consecutive addresses will
  1619. // likely result in more instructions compared to scalar code where the
  1620. // computation can more often be merged into the index mode. The resulting
  1621. // extra micro-ops can significantly decrease throughput.
  1622. unsigned NumVectorInstToHideOverhead = 10;
  1623. int MaxMergeDistance = 64;
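// Only accesses with a known constant stride of at most MaxMergeDistance
// bytes are assumed to fold into the addressing mode; anything else is
// charged the larger vector overhead above.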
  1624. if (Ty->isVectorTy() && SE &&
  1625. !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
  1626. return NumVectorInstToHideOverhead;
  1627. // In many cases the address computation is not merged into the instruction
  1628. // addressing mode.
  1629. return 1;
  1630. }
  1631. InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  1632. Type *CondTy,
  1633. CmpInst::Predicate VecPred,
  1634. TTI::TargetCostKind CostKind,
  1635. const Instruction *I) {
  1636. // TODO: Handle other cost kinds.
  1637. if (CostKind != TTI::TCK_RecipThroughput)
  1638. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  1639. I);
  1640. int ISD = TLI->InstructionOpcodeToISD(Opcode);
1641. // We don't lower some vector selects well when they are wider than the
1642. // register width.
  1643. if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
  1644. // We would need this many instructions to hide the scalarization happening.
  1645. const int AmortizationCost = 20;
  1646. // If VecPred is not set, check if we can get a predicate from the context
  1647. // instruction, if its type matches the requested ValTy.
  1648. if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
  1649. CmpInst::Predicate CurrentPred;
  1650. if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
  1651. m_Value())))
  1652. VecPred = CurrentPred;
  1653. }
  1654. // Check if we have a compare/select chain that can be lowered using
  1655. // a (F)CMxx & BFI pair.
  1656. if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
  1657. VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
  1658. VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
  1659. VecPred == CmpInst::FCMP_UNE) {
  1660. static const auto ValidMinMaxTys = {
  1661. MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
  1662. MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
  1663. static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
  1664. auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
  1665. if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
  1666. (ST->hasFullFP16() &&
  1667. any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
  1668. return LT.first;
  1669. }
  1670. static const TypeConversionCostTblEntry
  1671. VectorSelectTbl[] = {
  1672. { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
  1673. { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
  1674. { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
  1675. { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
  1676. { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
  1677. { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
  1678. };
  1679. EVT SelCondTy = TLI->getValueType(DL, CondTy);
  1680. EVT SelValTy = TLI->getValueType(DL, ValTy);
  1681. if (SelCondTy.isSimple() && SelValTy.isSimple()) {
  1682. if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
  1683. SelCondTy.getSimpleVT(),
  1684. SelValTy.getSimpleVT()))
  1685. return Entry->Cost;
  1686. }
  1687. }
  1688. // The base case handles scalable vectors fine for now, since it treats the
  1689. // cost as 1 * legalization cost.
  1690. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  1691. }
  1692. AArch64TTIImpl::TTI::MemCmpExpansionOptions
  1693. AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  1694. TTI::MemCmpExpansionOptions Options;
  1695. if (ST->requiresStrictAlign()) {
  1696. // TODO: Add cost modeling for strict align. Misaligned loads expand to
  1697. // a bunch of instructions when strict align is enabled.
  1698. return Options;
  1699. }
  1700. Options.AllowOverlappingLoads = true;
  1701. Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  1702. Options.NumLoadsPerBlock = Options.MaxNumLoads;
  1703. // TODO: Though vector loads usually perform well on AArch64, in some targets
  1704. // they may wake up the FP unit, which raises the power consumption. Perhaps
  1705. // they could be used with no holds barred (-O3).
  1706. Options.LoadSizes = {8, 4, 2, 1};
  1707. return Options;
  1708. }
  1709. InstructionCost
  1710. AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
  1711. Align Alignment, unsigned AddressSpace,
  1712. TTI::TargetCostKind CostKind) {
  1713. if (useNeonVector(Src))
  1714. return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1715. CostKind);
  1716. auto LT = TLI->getTypeLegalizationCost(DL, Src);
  1717. if (!LT.first.isValid())
  1718. return InstructionCost::getInvalid();
  1719. // The code-generator is currently not able to handle scalable vectors
  1720. // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  1721. // it. This change will be removed when code-generation for these types is
  1722. // sufficiently reliable.
  1723. if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
  1724. return InstructionCost::getInvalid();
  1725. return LT.first * 2;
  1726. }
  1727. static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
  1728. return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
  1729. }
  1730. InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
  1731. unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
  1732. Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  1733. if (useNeonVector(DataTy))
  1734. return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
  1735. Alignment, CostKind, I);
  1736. auto *VT = cast<VectorType>(DataTy);
  1737. auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
  1738. if (!LT.first.isValid())
  1739. return InstructionCost::getInvalid();
  1740. // The code-generator is currently not able to handle scalable vectors
  1741. // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  1742. // it. This change will be removed when code-generation for these types is
  1743. // sufficiently reliable.
  1744. if (cast<VectorType>(DataTy)->getElementCount() ==
  1745. ElementCount::getScalable(1))
  1746. return InstructionCost::getInvalid();
  1747. ElementCount LegalVF = LT.second.getVectorElementCount();
  1748. InstructionCost MemOpCost =
  1749. getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
  1750. // Add on an overhead cost for using gathers/scatters.
  1751. // TODO: At the moment this is applied unilaterally for all CPUs, but at some
  1752. // point we may want a per-CPU overhead.
  1753. MemOpCost *= getSVEGatherScatterOverhead(Opcode);
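// Model the gather/scatter as one scalar memory access per (maximum) vector
// lane, scaled by the per-element overhead and any type-split factor.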
  1754. return LT.first * MemOpCost * getMaxNumElements(LegalVF);
  1755. }
  1756. bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
  1757. return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
  1758. }
  1759. InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
  1760. MaybeAlign Alignment,
  1761. unsigned AddressSpace,
  1762. TTI::TargetCostKind CostKind,
  1763. const Instruction *I) {
  1764. EVT VT = TLI->getValueType(DL, Ty, true);
  1765. // Type legalization can't handle structs
  1766. if (VT == MVT::Other)
  1767. return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
  1768. CostKind);
  1769. auto LT = TLI->getTypeLegalizationCost(DL, Ty);
  1770. if (!LT.first.isValid())
  1771. return InstructionCost::getInvalid();
  1772. // The code-generator is currently not able to handle scalable vectors
  1773. // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  1774. // it. This change will be removed when code-generation for these types is
  1775. // sufficiently reliable.
  1776. if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
  1777. if (VTy->getElementCount() == ElementCount::getScalable(1))
  1778. return InstructionCost::getInvalid();
  1779. // TODO: consider latency as well for TCK_SizeAndLatency.
  1780. if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
  1781. return LT.first;
  1782. if (CostKind != TTI::TCK_RecipThroughput)
  1783. return 1;
  1784. if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
  1785. LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
  1786. // Unaligned stores are extremely inefficient. We don't split all
1787. // unaligned 128-bit stores because of the negative impact that has been
1788. // shown in practice on inlined block copy code.
  1789. // We make such stores expensive so that we will only vectorize if there
  1790. // are 6 other instructions getting vectorized.
  1791. const int AmortizationCost = 6;
  1792. return LT.first * 2 * AmortizationCost;
  1793. }
  1794. // Check truncating stores and extending loads.
  1795. if (useNeonVector(Ty) &&
  1796. Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1797. // v4i8 types are lowered to a scalar load/store and sshll/xtn.
  1798. if (VT == MVT::v4i8)
  1799. return 2;
  1800. // Otherwise we need to scalarize.
  1801. return cast<FixedVectorType>(Ty)->getNumElements() * 2;
  1802. }
  1803. return LT.first;
  1804. }
  1805. InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
  1806. unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
  1807. Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  1808. bool UseMaskForCond, bool UseMaskForGaps) {
  1809. assert(Factor >= 2 && "Invalid interleave factor");
  1810. auto *VecVTy = cast<FixedVectorType>(VecTy);
  1811. if (!UseMaskForCond && !UseMaskForGaps &&
  1812. Factor <= TLI->getMaxSupportedInterleaveFactor()) {
  1813. unsigned NumElts = VecVTy->getNumElements();
  1814. auto *SubVecTy =
  1815. FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
  1816. // ldN/stN only support legal vector types of size 64 or 128 in bits.
  1817. // Accesses having vector types that are a multiple of 128 bits can be
  1818. // matched to more than one ldN/stN instruction.
  1819. bool UseScalable;
  1820. if (NumElts % Factor == 0 &&
  1821. TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
  1822. return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
  1823. }
  1824. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  1825. Alignment, AddressSpace, CostKind,
  1826. UseMaskForCond, UseMaskForGaps);
  1827. }
  1828. InstructionCost
  1829. AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  1830. InstructionCost Cost = 0;
  1831. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
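// Keeping a 128-bit vector live across a call is modelled as spilling it to
// the stack and reloading it, i.e. one vector store plus one vector load.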
  1832. for (auto *I : Tys) {
  1833. if (!I->isVectorTy())
  1834. continue;
  1835. if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
  1836. 128)
  1837. Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
  1838. getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
  1839. }
  1840. return Cost;
  1841. }
  1842. unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  1843. return ST->getMaxInterleaveFactor();
  1844. }
  1845. // For Falkor, we want to avoid having too many strided loads in a loop since
  1846. // that can exhaust the HW prefetcher resources. We adjust the unroller
  1847. // MaxCount preference below to attempt to ensure unrolling doesn't create too
  1848. // many strided loads.
  1849. static void
  1850. getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  1851. TargetTransformInfo::UnrollingPreferences &UP) {
  1852. enum { MaxStridedLoads = 7 };
  1853. auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
  1854. int StridedLoads = 0;
  1855. // FIXME? We could make this more precise by looking at the CFG and
  1856. // e.g. not counting loads in each side of an if-then-else diamond.
  1857. for (const auto BB : L->blocks()) {
  1858. for (auto &I : *BB) {
  1859. LoadInst *LMemI = dyn_cast<LoadInst>(&I);
  1860. if (!LMemI)
  1861. continue;
  1862. Value *PtrValue = LMemI->getPointerOperand();
  1863. if (L->isLoopInvariant(PtrValue))
  1864. continue;
  1865. const SCEV *LSCEV = SE.getSCEV(PtrValue);
  1866. const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
  1867. if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
  1868. continue;
  1869. // FIXME? We could take pairing of unrolled load copies into account
  1870. // by looking at the AddRec, but we would probably have to limit this
  1871. // to loops with no stores or other memory optimization barriers.
  1872. ++StridedLoads;
  1873. // We've seen enough strided loads that seeing more won't make a
  1874. // difference.
  1875. if (StridedLoads > MaxStridedLoads / 2)
  1876. return StridedLoads;
  1877. }
  1878. }
  1879. return StridedLoads;
  1880. };
  1881. int StridedLoads = countStridedLoads(L, SE);
  1882. LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
  1883. << " strided loads\n");
  1884. // Pick the largest power of 2 unroll count that won't result in too many
  1885. // strided loads.
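// For example, with MaxStridedLoads == 7, detecting 3 strided loads clamps
// MaxCount to 1 << Log2_32(7 / 3) == 2.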
  1886. if (StridedLoads) {
  1887. UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
  1888. LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
  1889. << UP.MaxCount << '\n');
  1890. }
  1891. }
  1892. void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  1893. TTI::UnrollingPreferences &UP,
  1894. OptimizationRemarkEmitter *ORE) {
  1895. // Enable partial unrolling and runtime unrolling.
  1896. BaseT::getUnrollingPreferences(L, SE, UP, ORE);
  1897. UP.UpperBound = true;
1898. // Inner loops are more likely to be hot, and their runtime checks can be
1899. // hoisted out by LICM, so the overhead is lower; use a larger threshold
1900. // to unroll more loops.
  1901. if (L->getLoopDepth() > 1)
  1902. UP.PartialThreshold *= 2;
  1903. // Disable partial & runtime unrolling on -Os.
  1904. UP.PartialOptSizeThreshold = 0;
  1905. if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
  1906. EnableFalkorHWPFUnrollFix)
  1907. getFalkorUnrollingPreferences(L, SE, UP);
  1908. // Scan the loop: don't unroll loops with calls as this could prevent
  1909. // inlining. Don't unroll vector loops either, as they don't benefit much from
  1910. // unrolling.
  1911. for (auto *BB : L->getBlocks()) {
  1912. for (auto &I : *BB) {
  1913. // Don't unroll vectorised loop.
  1914. if (I.getType()->isVectorTy())
  1915. return;
  1916. if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
  1917. if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
  1918. if (!isLoweredToCall(F))
  1919. continue;
  1920. }
  1921. return;
  1922. }
  1923. }
  1924. }
1925. // Enable runtime unrolling for in-order models.
1926. // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1927. // checking for that case we can ensure that the default behaviour is
1928. // unchanged.
  1929. if (ST->getProcFamily() != AArch64Subtarget::Others &&
  1930. !ST->getSchedModel().isOutOfOrder()) {
  1931. UP.Runtime = true;
  1932. UP.Partial = true;
  1933. UP.UnrollRemainder = true;
  1934. UP.DefaultUnrollRuntimeCount = 4;
  1935. UP.UnrollAndJam = true;
  1936. UP.UnrollAndJamInnerLoopThreshold = 60;
  1937. }
  1938. }
  1939. void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  1940. TTI::PeelingPreferences &PP) {
  1941. BaseT::getPeelingPreferences(L, SE, PP);
  1942. }
  1943. Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
  1944. Type *ExpectedType) {
  1945. switch (Inst->getIntrinsicID()) {
  1946. default:
  1947. return nullptr;
  1948. case Intrinsic::aarch64_neon_st2:
  1949. case Intrinsic::aarch64_neon_st3:
  1950. case Intrinsic::aarch64_neon_st4: {
1951. // The expected type must be a struct whose element types match the stored
1952. // values.
  1952. StructType *ST = dyn_cast<StructType>(ExpectedType);
  1953. if (!ST)
  1954. return nullptr;
  1955. unsigned NumElts = Inst->arg_size() - 1;
  1956. if (ST->getNumElements() != NumElts)
  1957. return nullptr;
  1958. for (unsigned i = 0, e = NumElts; i != e; ++i) {
  1959. if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
  1960. return nullptr;
  1961. }
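// All operand types match the struct, so rebuild the stored aggregate by
// inserting each value operand; this lets a later load of the struct type
// forward from the stN intrinsic.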
  1962. Value *Res = UndefValue::get(ExpectedType);
  1963. IRBuilder<> Builder(Inst);
  1964. for (unsigned i = 0, e = NumElts; i != e; ++i) {
  1965. Value *L = Inst->getArgOperand(i);
  1966. Res = Builder.CreateInsertValue(Res, L, i);
  1967. }
  1968. return Res;
  1969. }
  1970. case Intrinsic::aarch64_neon_ld2:
  1971. case Intrinsic::aarch64_neon_ld3:
  1972. case Intrinsic::aarch64_neon_ld4:
  1973. if (Inst->getType() == ExpectedType)
  1974. return Inst;
  1975. return nullptr;
  1976. }
  1977. }
  1978. bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
  1979. MemIntrinsicInfo &Info) {
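// Record whether the structured NEON intrinsic reads or writes memory and
// which operand is the pointer, then assign a MatchingId so equivalent
// ldN/stN accesses can be paired up (e.g. by EarlyCSE).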
  1980. switch (Inst->getIntrinsicID()) {
  1981. default:
  1982. break;
  1983. case Intrinsic::aarch64_neon_ld2:
  1984. case Intrinsic::aarch64_neon_ld3:
  1985. case Intrinsic::aarch64_neon_ld4:
  1986. Info.ReadMem = true;
  1987. Info.WriteMem = false;
  1988. Info.PtrVal = Inst->getArgOperand(0);
  1989. break;
  1990. case Intrinsic::aarch64_neon_st2:
  1991. case Intrinsic::aarch64_neon_st3:
  1992. case Intrinsic::aarch64_neon_st4:
  1993. Info.ReadMem = false;
  1994. Info.WriteMem = true;
  1995. Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
  1996. break;
  1997. }
  1998. switch (Inst->getIntrinsicID()) {
  1999. default:
  2000. return false;
  2001. case Intrinsic::aarch64_neon_ld2:
  2002. case Intrinsic::aarch64_neon_st2:
  2003. Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
  2004. break;
  2005. case Intrinsic::aarch64_neon_ld3:
  2006. case Intrinsic::aarch64_neon_st3:
  2007. Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
  2008. break;
  2009. case Intrinsic::aarch64_neon_ld4:
  2010. case Intrinsic::aarch64_neon_st4:
  2011. Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
  2012. break;
  2013. }
  2014. return true;
  2015. }
2016. /// See if \p I should be considered for address type promotion. We check if
2017. /// \p I is a sext with the right type that is used in memory accesses. If it
2018. /// is used in a "complex" getelementptr, we allow it to be promoted without
2019. /// finding other sext instructions that sign extended the same initial value.
2020. /// A getelementptr is considered "complex" if it has more than 2 operands.
  2021. bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
  2022. const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  2023. bool Considerable = false;
  2024. AllowPromotionWithoutCommonHeader = false;
  2025. if (!isa<SExtInst>(&I))
  2026. return false;
  2027. Type *ConsideredSExtType =
  2028. Type::getInt64Ty(I.getParent()->getParent()->getContext());
  2029. if (I.getType() != ConsideredSExtType)
  2030. return false;
  2031. // See if the sext is the one with the right type and used in at least one
  2032. // GetElementPtrInst.
  2033. for (const User *U : I.users()) {
  2034. if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
  2035. Considerable = true;
  2036. // A getelementptr is considered as "complex" if it has more than 2
2037. // operands. We will promote a SExt used in such a complex GEP, as we
2038. // expect some of the computation to be merged if it is done on 64 bits.
  2039. if (GEPInst->getNumOperands() > 2) {
  2040. AllowPromotionWithoutCommonHeader = true;
  2041. break;
  2042. }
  2043. }
  2044. }
  2045. return Considerable;
  2046. }
  2047. bool AArch64TTIImpl::isLegalToVectorizeReduction(
  2048. const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
  2049. if (!VF.isScalable())
  2050. return true;
  2051. Type *Ty = RdxDesc.getRecurrenceType();
  2052. if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
  2053. return false;
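// For scalable VFs, only allow reduction kinds that have a known SVE
// horizontal-reduction lowering.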
  2054. switch (RdxDesc.getRecurrenceKind()) {
  2055. case RecurKind::Add:
  2056. case RecurKind::FAdd:
  2057. case RecurKind::And:
  2058. case RecurKind::Or:
  2059. case RecurKind::Xor:
  2060. case RecurKind::SMin:
  2061. case RecurKind::SMax:
  2062. case RecurKind::UMin:
  2063. case RecurKind::UMax:
  2064. case RecurKind::FMin:
  2065. case RecurKind::FMax:
  2066. case RecurKind::SelectICmp:
  2067. case RecurKind::SelectFCmp:
  2068. case RecurKind::FMulAdd:
  2069. return true;
  2070. default:
  2071. return false;
  2072. }
  2073. }
  2074. InstructionCost
  2075. AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
  2076. bool IsUnsigned,
  2077. TTI::TargetCostKind CostKind) {
  2078. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  2079. if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
  2080. return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
  2081. assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2082. "Both vectors need to be equally scalable");
  2083. InstructionCost LegalizationCost = 0;
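// If the type is split into LT.first registers, the partial results are
// first combined with LT.first - 1 pairwise min/max operations before the
// final horizontal reduction (costed as 2 below).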
  2084. if (LT.first > 1) {
  2085. Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
  2086. unsigned MinMaxOpcode =
  2087. Ty->isFPOrFPVectorTy()
  2088. ? Intrinsic::maxnum
  2089. : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
  2090. IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
  2091. LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
  2092. }
  2093. return LegalizationCost + /*Cost of horizontal reduction*/ 2;
  2094. }
  2095. InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
  2096. unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
  2097. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  2098. InstructionCost LegalizationCost = 0;
  2099. if (LT.first > 1) {
  2100. Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
  2101. LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
  2102. LegalizationCost *= LT.first - 1;
  2103. }
  2104. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  2105. assert(ISD && "Invalid opcode");
  2106. // Add the final reduction cost for the legal horizontal reduction
  2107. switch (ISD) {
  2108. case ISD::ADD:
  2109. case ISD::AND:
  2110. case ISD::OR:
  2111. case ISD::XOR:
  2112. case ISD::FADD:
  2113. return LegalizationCost + 2;
  2114. default:
  2115. return InstructionCost::getInvalid();
  2116. }
  2117. }
  2118. InstructionCost
  2119. AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
  2120. Optional<FastMathFlags> FMF,
  2121. TTI::TargetCostKind CostKind) {
  2122. if (TTI::requiresOrderedReduction(FMF)) {
  2123. if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
  2124. InstructionCost BaseCost =
  2125. BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  2126. // Add on extra cost to reflect the extra overhead on some CPUs. We still
  2127. // end up vectorizing for more computationally intensive loops.
  2128. return BaseCost + FixedVTy->getNumElements();
  2129. }
  2130. if (Opcode != Instruction::FAdd)
  2131. return InstructionCost::getInvalid();
  2132. auto *VTy = cast<ScalableVectorType>(ValTy);
  2133. InstructionCost Cost =
  2134. getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
  2135. Cost *= getMaxNumElements(VTy->getElementCount());
  2136. return Cost;
  2137. }
  2138. if (isa<ScalableVectorType>(ValTy))
  2139. return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
  2140. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  2141. MVT MTy = LT.second;
  2142. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  2143. assert(ISD && "Invalid opcode");
  2144. // Horizontal adds can use the 'addv' instruction. We model the cost of these
  2145. // instructions as twice a normal vector add, plus 1 for each legalization
  2146. // step (LT.first). This is the only arithmetic vector reduction operation for
  2147. // which we have an instruction.
  2148. // OR, XOR and AND costs should match the codegen from:
  2149. // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  2150. // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  2151. // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  2152. static const CostTblEntry CostTblNoPairwise[]{
  2153. {ISD::ADD, MVT::v8i8, 2},
  2154. {ISD::ADD, MVT::v16i8, 2},
  2155. {ISD::ADD, MVT::v4i16, 2},
  2156. {ISD::ADD, MVT::v8i16, 2},
  2157. {ISD::ADD, MVT::v4i32, 2},
  2158. {ISD::OR, MVT::v8i8, 15},
  2159. {ISD::OR, MVT::v16i8, 17},
  2160. {ISD::OR, MVT::v4i16, 7},
  2161. {ISD::OR, MVT::v8i16, 9},
  2162. {ISD::OR, MVT::v2i32, 3},
  2163. {ISD::OR, MVT::v4i32, 5},
  2164. {ISD::OR, MVT::v2i64, 3},
  2165. {ISD::XOR, MVT::v8i8, 15},
  2166. {ISD::XOR, MVT::v16i8, 17},
  2167. {ISD::XOR, MVT::v4i16, 7},
  2168. {ISD::XOR, MVT::v8i16, 9},
  2169. {ISD::XOR, MVT::v2i32, 3},
  2170. {ISD::XOR, MVT::v4i32, 5},
  2171. {ISD::XOR, MVT::v2i64, 3},
  2172. {ISD::AND, MVT::v8i8, 15},
  2173. {ISD::AND, MVT::v16i8, 17},
  2174. {ISD::AND, MVT::v4i16, 7},
  2175. {ISD::AND, MVT::v8i16, 9},
  2176. {ISD::AND, MVT::v2i32, 3},
  2177. {ISD::AND, MVT::v4i32, 5},
  2178. {ISD::AND, MVT::v2i64, 3},
  2179. };
  2180. switch (ISD) {
  2181. default:
  2182. break;
  2183. case ISD::ADD:
  2184. if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
  2185. return (LT.first - 1) + Entry->Cost;
  2186. break;
  2187. case ISD::XOR:
  2188. case ISD::AND:
  2189. case ISD::OR:
  2190. const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
  2191. if (!Entry)
  2192. break;
  2193. auto *ValVTy = cast<FixedVectorType>(ValTy);
  2194. if (!ValVTy->getElementType()->isIntegerTy(1) &&
  2195. MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
  2196. isPowerOf2_32(ValVTy->getNumElements())) {
  2197. InstructionCost ExtraCost = 0;
  2198. if (LT.first != 1) {
  2199. // Type needs to be split, so there is an extra cost of LT.first - 1
  2200. // arithmetic ops.
  2201. auto *Ty = FixedVectorType::get(ValTy->getElementType(),
  2202. MTy.getVectorNumElements());
  2203. ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
  2204. ExtraCost *= LT.first - 1;
  2205. }
  2206. return Entry->Cost + ExtraCost;
  2207. }
  2208. break;
  2209. }
  2210. return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  2211. }
  2212. InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  2213. static const CostTblEntry ShuffleTbl[] = {
  2214. { TTI::SK_Splice, MVT::nxv16i8, 1 },
  2215. { TTI::SK_Splice, MVT::nxv8i16, 1 },
  2216. { TTI::SK_Splice, MVT::nxv4i32, 1 },
  2217. { TTI::SK_Splice, MVT::nxv2i64, 1 },
  2218. { TTI::SK_Splice, MVT::nxv2f16, 1 },
  2219. { TTI::SK_Splice, MVT::nxv4f16, 1 },
  2220. { TTI::SK_Splice, MVT::nxv8f16, 1 },
  2221. { TTI::SK_Splice, MVT::nxv2bf16, 1 },
  2222. { TTI::SK_Splice, MVT::nxv4bf16, 1 },
  2223. { TTI::SK_Splice, MVT::nxv8bf16, 1 },
  2224. { TTI::SK_Splice, MVT::nxv2f32, 1 },
  2225. { TTI::SK_Splice, MVT::nxv4f32, 1 },
  2226. { TTI::SK_Splice, MVT::nxv2f64, 1 },
  2227. };
  2228. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  2229. Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  2230. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  2231. EVT PromotedVT = LT.second.getScalarType() == MVT::i1
  2232. ? TLI->getPromotedVTForPredicate(EVT(LT.second))
  2233. : LT.second;
  2234. Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  2235. InstructionCost LegalizationCost = 0;
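// A splice with a negative (trailing-element) index needs a predicate to be
// materialized, which is costed here as a compare plus a select.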
  2236. if (Index < 0) {
  2237. LegalizationCost =
  2238. getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
  2239. CmpInst::BAD_ICMP_PREDICATE, CostKind) +
  2240. getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
  2241. CmpInst::BAD_ICMP_PREDICATE, CostKind);
  2242. }
2243. // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2244. // The cost is therefore computed on the promoted type.
  2245. if (LT.second.getScalarType() == MVT::i1) {
  2246. LegalizationCost +=
  2247. getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
  2248. TTI::CastContextHint::None, CostKind) +
  2249. getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
  2250. TTI::CastContextHint::None, CostKind);
  2251. }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}

InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {
    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        { TTI::SK_Broadcast, MVT::v8i8, 1 },
        { TTI::SK_Broadcast, MVT::v16i8, 1 },
        { TTI::SK_Broadcast, MVT::v4i16, 1 },
        { TTI::SK_Broadcast, MVT::v8i16, 1 },
        { TTI::SK_Broadcast, MVT::v2i32, 1 },
        { TTI::SK_Broadcast, MVT::v4i32, 1 },
        { TTI::SK_Broadcast, MVT::v2i64, 1 },
        { TTI::SK_Broadcast, MVT::v2f32, 1 },
        { TTI::SK_Broadcast, MVT::v4f32, 1 },
        { TTI::SK_Broadcast, MVT::v2f64, 1 },
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        { TTI::SK_Transpose, MVT::v8i8, 1 },
        { TTI::SK_Transpose, MVT::v16i8, 1 },
        { TTI::SK_Transpose, MVT::v4i16, 1 },
        { TTI::SK_Transpose, MVT::v8i16, 1 },
        { TTI::SK_Transpose, MVT::v2i32, 1 },
        { TTI::SK_Transpose, MVT::v4i32, 1 },
        { TTI::SK_Transpose, MVT::v2i64, 1 },
        { TTI::SK_Transpose, MVT::v2f32, 1 },
        { TTI::SK_Transpose, MVT::v4f32, 1 },
        { TTI::SK_Transpose, MVT::v2f64, 1 },
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
        // PermuteSingleSrc shuffle kinds.
        { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
        // Broadcast shuffle kinds for scalable vectors
        { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
        { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
        { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
        { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
        { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
        // Handle the cases for vector.reverse with scalable vectors
        { TTI::SK_Reverse, MVT::nxv16i8, 1 },
        { TTI::SK_Reverse, MVT::nxv8i16, 1 },
        { TTI::SK_Reverse, MVT::nxv4i32, 1 },
        { TTI::SK_Reverse, MVT::nxv2i64, 1 },
        { TTI::SK_Reverse, MVT::nxv2f16, 1 },
        { TTI::SK_Reverse, MVT::nxv4f16, 1 },
        { TTI::SK_Reverse, MVT::nxv8f16, 1 },
        { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv2f32, 1 },
        { TTI::SK_Reverse, MVT::nxv4f32, 1 },
        { TTI::SK_Reverse, MVT::nxv2f64, 1 },
        { TTI::SK_Reverse, MVT::nxv16i1, 1 },
        { TTI::SK_Reverse, MVT::nxv8i1, 1 },
        { TTI::SK_Reverse, MVT::nxv4i1, 1 },
        { TTI::SK_Reverse, MVT::nxv2i1, 1 },
    };
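    // The lookup below simply scales the per-legal-type cost by the number of
    // legalized parts. For example, reversing <8 x i32> legalizes to two
    // v4i32 halves (LT.first == 2), so the SK_Reverse v4i32 entry of 2 gives
    // a total cost of 4.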
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }
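  // Scalable-vector splices get their own SVE-aware costing (see
  // getSpliceCost above); everything else falls back to the generic model.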
  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}