  1. //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "ARMTargetTransformInfo.h"
  9. #include "ARMSubtarget.h"
  10. #include "MCTargetDesc/ARMAddressingModes.h"
  11. #include "llvm/ADT/APInt.h"
  12. #include "llvm/ADT/SmallVector.h"
  13. #include "llvm/Analysis/LoopInfo.h"
  14. #include "llvm/CodeGen/CostTable.h"
  15. #include "llvm/CodeGen/ISDOpcodes.h"
  16. #include "llvm/CodeGen/ValueTypes.h"
  17. #include "llvm/IR/BasicBlock.h"
  18. #include "llvm/IR/DataLayout.h"
  19. #include "llvm/IR/DerivedTypes.h"
  20. #include "llvm/IR/Instruction.h"
  21. #include "llvm/IR/Instructions.h"
  22. #include "llvm/IR/IntrinsicInst.h"
  23. #include "llvm/IR/Intrinsics.h"
  24. #include "llvm/IR/IntrinsicsARM.h"
  25. #include "llvm/IR/PatternMatch.h"
  26. #include "llvm/IR/Type.h"
  27. #include "llvm/MC/SubtargetFeature.h"
  28. #include "llvm/Support/Casting.h"
  29. #include "llvm/Support/KnownBits.h"
  30. #include "llvm/Support/MachineValueType.h"
  31. #include "llvm/Target/TargetMachine.h"
  32. #include "llvm/Transforms/InstCombine/InstCombiner.h"
  33. #include "llvm/Transforms/Utils/Local.h"
  34. #include "llvm/Transforms/Utils/LoopUtils.h"
  35. #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
  36. #include <algorithm>
  37. #include <cassert>
  38. #include <cstdint>
  39. #include <optional>
  40. #include <utility>
  41. using namespace llvm;
  42. #define DEBUG_TYPE "armtti"
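// Command-line flags that control ARM-specific cost-model behaviour.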
  43. static cl::opt<bool> EnableMaskedLoadStores(
  44. "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  45. cl::desc("Enable the generation of masked loads and stores"));
  46. static cl::opt<bool> DisableLowOverheadLoops(
  47. "disable-arm-loloops", cl::Hidden, cl::init(false),
  48. cl::desc("Disable the generation of low-overhead loops"));
  49. static cl::opt<bool>
  50. AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
  51. cl::desc("Enable the generation of WLS loops"));
  52. extern cl::opt<TailPredication::Mode> EnableTailPredication;
  53. extern cl::opt<bool> EnableMaskedGatherScatters;
  54. extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
  55. /// Convert a vector load intrinsic into a simple llvm load instruction.
  56. /// This is beneficial when the underlying object being addressed comes
  57. /// from a constant, since we get constant-folding for free.
  58. static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
  59. InstCombiner::BuilderTy &Builder) {
  60. auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
  61. if (!IntrAlign)
  62. return nullptr;
  63. unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
  64. ? MemAlign
  65. : IntrAlign->getLimitedValue();
  66. if (!isPowerOf2_32(Alignment))
  67. return nullptr;
  68. auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
  69. PointerType::get(II.getType(), 0));
  70. return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
  71. }
  72. bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
  73. const Function *Callee) const {
  74. const TargetMachine &TM = getTLI()->getTargetMachine();
  75. const FeatureBitset &CallerBits =
  76. TM.getSubtargetImpl(*Caller)->getFeatureBits();
  77. const FeatureBitset &CalleeBits =
  78. TM.getSubtargetImpl(*Callee)->getFeatureBits();
  79. // To inline a callee, all features not in the allowed list must match exactly.
  80. bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
  81. (CalleeBits & ~InlineFeaturesAllowed);
  82. // For features in the allowed list, the callee's features must be a subset of
83. // the caller's.
  84. bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
  85. (CalleeBits & InlineFeaturesAllowed);
  86. return MatchExact && MatchSubset;
  87. }
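// Prefer post-indexed addressing when MVE is available, avoid pre/post
// indexing when optimising for size, and use pre-indexed addressing for
// simple single-block loops on Thumb2 M-class cores.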
  88. TTI::AddressingModeKind
  89. ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
  90. ScalarEvolution *SE) const {
  91. if (ST->hasMVEIntegerOps())
  92. return TTI::AMK_PostIndexed;
  93. if (L->getHeader()->getParent()->hasOptSize())
  94. return TTI::AMK_None;
  95. if (ST->isMClass() && ST->isThumb2() &&
  96. L->getNumBlocks() == 1)
  97. return TTI::AMK_PreIndexed;
  98. return TTI::AMK_None;
  99. }
  100. std::optional<Instruction *>
  101. ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  102. using namespace PatternMatch;
  103. Intrinsic::ID IID = II.getIntrinsicID();
  104. switch (IID) {
  105. default:
  106. break;
  107. case Intrinsic::arm_neon_vld1: {
  108. Align MemAlign =
  109. getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
  110. &IC.getAssumptionCache(), &IC.getDominatorTree());
  111. if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
  112. return IC.replaceInstUsesWith(II, V);
  113. }
  114. break;
  115. }
  116. case Intrinsic::arm_neon_vld2:
  117. case Intrinsic::arm_neon_vld3:
  118. case Intrinsic::arm_neon_vld4:
  119. case Intrinsic::arm_neon_vld2lane:
  120. case Intrinsic::arm_neon_vld3lane:
  121. case Intrinsic::arm_neon_vld4lane:
  122. case Intrinsic::arm_neon_vst1:
  123. case Intrinsic::arm_neon_vst2:
  124. case Intrinsic::arm_neon_vst3:
  125. case Intrinsic::arm_neon_vst4:
  126. case Intrinsic::arm_neon_vst2lane:
  127. case Intrinsic::arm_neon_vst3lane:
  128. case Intrinsic::arm_neon_vst4lane: {
  129. Align MemAlign =
  130. getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
  131. &IC.getAssumptionCache(), &IC.getDominatorTree());
  132. unsigned AlignArg = II.arg_size() - 1;
  133. Value *AlignArgOp = II.getArgOperand(AlignArg);
  134. MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
  135. if (Align && *Align < MemAlign) {
  136. return IC.replaceOperand(
  137. II, AlignArg,
  138. ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
  139. false));
  140. }
  141. break;
  142. }
  143. case Intrinsic::arm_mve_pred_i2v: {
  144. Value *Arg = II.getArgOperand(0);
  145. Value *ArgArg;
  146. if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
  147. PatternMatch::m_Value(ArgArg))) &&
  148. II.getType() == ArgArg->getType()) {
  149. return IC.replaceInstUsesWith(II, ArgArg);
  150. }
  151. Constant *XorMask;
  152. if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
  153. PatternMatch::m_Value(ArgArg)),
  154. PatternMatch::m_Constant(XorMask))) &&
  155. II.getType() == ArgArg->getType()) {
  156. if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
  157. if (CI->getValue().trunc(16).isAllOnes()) {
  158. auto TrueVector = IC.Builder.CreateVectorSplat(
  159. cast<FixedVectorType>(II.getType())->getNumElements(),
  160. IC.Builder.getTrue());
  161. return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
  162. }
  163. }
  164. }
  165. KnownBits ScalarKnown(32);
  166. if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
  167. ScalarKnown, 0)) {
  168. return &II;
  169. }
  170. break;
  171. }
  172. case Intrinsic::arm_mve_pred_v2i: {
  173. Value *Arg = II.getArgOperand(0);
  174. Value *ArgArg;
  175. if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
  176. PatternMatch::m_Value(ArgArg)))) {
  177. return IC.replaceInstUsesWith(II, ArgArg);
  178. }
  179. if (!II.getMetadata(LLVMContext::MD_range)) {
  180. Type *IntTy32 = Type::getInt32Ty(II.getContext());
  181. Metadata *M[] = {
  182. ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
  183. ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
  184. II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
  185. return &II;
  186. }
  187. break;
  188. }
  189. case Intrinsic::arm_mve_vadc:
  190. case Intrinsic::arm_mve_vadc_predicated: {
  191. unsigned CarryOp =
  192. (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
  193. assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
  194. "Bad type for intrinsic!");
  195. KnownBits CarryKnown(32);
  196. if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
  197. CarryKnown)) {
  198. return &II;
  199. }
  200. break;
  201. }
  202. case Intrinsic::arm_mve_vmldava: {
  203. Instruction *I = cast<Instruction>(&II);
  204. if (I->hasOneUse()) {
  205. auto *User = cast<Instruction>(*I->user_begin());
  206. Value *OpZ;
  207. if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
  208. match(I->getOperand(3), m_Zero())) {
  209. Value *OpX = I->getOperand(4);
  210. Value *OpY = I->getOperand(5);
  211. Type *OpTy = OpX->getType();
  212. IC.Builder.SetInsertPoint(User);
  213. Value *V =
  214. IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
  215. {I->getOperand(0), I->getOperand(1),
  216. I->getOperand(2), OpZ, OpX, OpY});
  217. IC.replaceInstUsesWith(*User, V);
  218. return IC.eraseInstFromFunction(*User);
  219. }
  220. }
  221. return std::nullopt;
  222. }
  223. }
  224. return std::nullopt;
  225. }
  226. std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
  227. InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
  228. APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
  229. std::function<void(Instruction *, unsigned, APInt, APInt &)>
  230. SimplifyAndSetOp) const {
231. // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
232. // index of the operand that selects a Top or Bottom instruction, which can
233. // change between intrinsics.
234. auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
  235. unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
  236. unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
237. // Only the odd or even lanes of operand 0 are demanded, depending on
238. // whether this is a top or bottom instruction.
  239. APInt DemandedElts =
  240. APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
  241. : APInt::getHighBitsSet(2, 1));
  242. SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
  243. // The other lanes will be defined from the inserted elements.
  244. UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
  245. : APInt::getHighBitsSet(2, 1));
  246. return std::nullopt;
  247. };
  248. switch (II.getIntrinsicID()) {
  249. default:
  250. break;
  251. case Intrinsic::arm_mve_vcvt_narrow:
  252. SimplifyNarrowInstrTopBottom(2);
  253. break;
  254. case Intrinsic::arm_mve_vqmovn:
  255. SimplifyNarrowInstrTopBottom(4);
  256. break;
  257. case Intrinsic::arm_mve_vshrn:
  258. SimplifyNarrowInstrTopBottom(7);
  259. break;
  260. }
  261. return std::nullopt;
  262. }
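// Estimate the cost of materialising the immediate Imm in a register:
// roughly 1 if it fits a simple mov/mvn/movw-style encoding, more if extra
// instructions or a constant-pool load are needed.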
  263. InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
  264. TTI::TargetCostKind CostKind) {
  265. assert(Ty->isIntegerTy());
  266. unsigned Bits = Ty->getPrimitiveSizeInBits();
  267. if (Bits == 0 || Imm.getActiveBits() >= 64)
  268. return 4;
  269. int64_t SImmVal = Imm.getSExtValue();
  270. uint64_t ZImmVal = Imm.getZExtValue();
  271. if (!ST->isThumb()) {
  272. if ((SImmVal >= 0 && SImmVal < 65536) ||
  273. (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
  274. (ARM_AM::getSOImmVal(~ZImmVal) != -1))
  275. return 1;
  276. return ST->hasV6T2Ops() ? 2 : 3;
  277. }
  278. if (ST->isThumb2()) {
  279. if ((SImmVal >= 0 && SImmVal < 65536) ||
  280. (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
  281. (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
  282. return 1;
  283. return ST->hasV6T2Ops() ? 2 : 3;
  284. }
285. // Thumb1: any i8 immediate costs 1.
  286. if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
  287. return 1;
  288. if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
  289. return 2;
290. // Otherwise, load from the constant pool.
  291. return 3;
  292. }
293. // Constants smaller than 256 fit in the immediate field of Thumb1
294. // instructions, so they get a zero cost; everything else costs 1.
  295. InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
  296. const APInt &Imm, Type *Ty) {
  297. if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
  298. return 0;
  299. return 1;
  300. }
  301. // Checks whether Inst is part of a min(max()) or max(min()) pattern
  302. // that will match to an SSAT instruction. Returns the instruction being
  303. // saturated, or null if no saturation pattern was found.
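// For example, smax(smin(x, 127), -128) clamps x to the signed 8-bit range
// and can become SSAT #8; here Imm is -128 and the inner constant is
// 127 == -Imm - 1.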
  304. static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  305. Value *LHS, *RHS;
  306. ConstantInt *C;
  307. SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
  308. if (InstSPF == SPF_SMAX &&
  309. PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
  310. C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
  311. auto isSSatMin = [&](Value *MinInst) {
  312. if (isa<SelectInst>(MinInst)) {
  313. Value *MinLHS, *MinRHS;
  314. ConstantInt *MinC;
  315. SelectPatternFlavor MinSPF =
  316. matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
  317. if (MinSPF == SPF_SMIN &&
  318. PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
  319. MinC->getValue() == ((-Imm) - 1))
  320. return true;
  321. }
  322. return false;
  323. };
  324. if (isSSatMin(Inst->getOperand(1)))
  325. return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
  326. if (Inst->hasNUses(2) &&
  327. (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
  328. return Inst->getOperand(1);
  329. }
  330. return nullptr;
  331. }
332. // Look for an FP saturation pattern, where the instruction can be simplified
333. // to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
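// For example, an fptosi to i64 clamped with smax/smin to
// [-2147483648, 2147483647] can become llvm.fptosi.sat.i32, so the i64
// clamp constants cost nothing.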
  334. static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  335. if (Imm.getBitWidth() != 64 ||
  336. Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
  337. return false;
  338. Value *FP = isSSATMinMaxPattern(Inst, Imm);
  339. if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
  340. FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  341. if (!FP)
  342. return false;
  343. return isa<FPToSIInst>(FP);
  344. }
  345. InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  346. const APInt &Imm, Type *Ty,
  347. TTI::TargetCostKind CostKind,
  348. Instruction *Inst) {
  349. // Division by a constant can be turned into multiplication, but only if we
  350. // know it's constant. So it's not so much that the immediate is cheap (it's
  351. // not), but that the alternative is worse.
  352. // FIXME: this is probably unneeded with GlobalISel.
  353. if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
  354. Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
  355. Idx == 1)
  356. return 0;
357. // Leave any gep offsets to CodeGenPrepare, which will do a better job of
358. // splitting any large offsets.
  359. if (Opcode == Instruction::GetElementPtr && Idx != 0)
  360. return 0;
  361. if (Opcode == Instruction::And) {
  362. // UXTB/UXTH
  363. if (Imm == 255 || Imm == 65535)
  364. return 0;
  365. // Conversion to BIC is free, and means we can use ~Imm instead.
  366. return std::min(getIntImmCost(Imm, Ty, CostKind),
  367. getIntImmCost(~Imm, Ty, CostKind));
  368. }
  369. if (Opcode == Instruction::Add)
  370. // Conversion to SUB is free, and means we can use -Imm instead.
  371. return std::min(getIntImmCost(Imm, Ty, CostKind),
  372. getIntImmCost(-Imm, Ty, CostKind));
  373. if (Opcode == Instruction::ICmp && Imm.isNegative() &&
  374. Ty->getIntegerBitWidth() == 32) {
  375. int64_t NegImm = -Imm.getSExtValue();
  376. if (ST->isThumb2() && NegImm < 1<<12)
  377. // icmp X, #-C -> cmn X, #C
  378. return 0;
  379. if (ST->isThumb() && NegImm < 1<<8)
  380. // icmp X, #-C -> adds X, #C
  381. return 0;
  382. }
  383. // xor a, -1 can always be folded to MVN
  384. if (Opcode == Instruction::Xor && Imm.isAllOnes())
  385. return 0;
386. // Ensure that negative constants of min(max()) or max(min()) patterns that
387. // match SSAT instructions don't get hoisted.
  388. if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
  389. Ty->getIntegerBitWidth() <= 32) {
  390. if (isSSATMinMaxPattern(Inst, Imm) ||
  391. (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
  392. isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
  393. return 0;
  394. }
  395. if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
  396. return 0;
  397. // We can convert <= -1 to < 0, which is generally quite cheap.
  398. if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
  399. ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
  400. if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
  401. return std::min(getIntImmCost(Imm, Ty, CostKind),
  402. getIntImmCost(Imm + 1, Ty, CostKind));
  403. }
  404. return getIntImmCost(Imm, Ty, CostKind);
  405. }
  406. InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
  407. TTI::TargetCostKind CostKind,
  408. const Instruction *I) {
  409. if (CostKind == TTI::TCK_RecipThroughput &&
  410. (ST->hasNEON() || ST->hasMVEIntegerOps())) {
411. // FIXME: The vectorizer is highly sensitive to the cost of these
  412. // instructions, which suggests that it may be using the costs incorrectly.
  413. // But, for now, just make them free to avoid performance regressions for
  414. // vector targets.
  415. return 0;
  416. }
  417. return BaseT::getCFInstrCost(Opcode, CostKind, I);
  418. }
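// Cast costs are looked up in the per-subtarget conversion tables (NEON and
// MVE) below; anything not covered falls back to the base implementation,
// scaled by the MVE vector cost factor for MVE vector types.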
  419. InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  420. Type *Src,
  421. TTI::CastContextHint CCH,
  422. TTI::TargetCostKind CostKind,
  423. const Instruction *I) {
  424. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  425. assert(ISD && "Invalid opcode");
  426. // TODO: Allow non-throughput costs that aren't binary.
  427. auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
  428. if (CostKind != TTI::TCK_RecipThroughput)
  429. return Cost == 0 ? 0 : 1;
  430. return Cost;
  431. };
  432. auto IsLegalFPType = [this](EVT VT) {
  433. EVT EltVT = VT.getScalarType();
  434. return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
  435. (EltVT == MVT::f64 && ST->hasFP64()) ||
  436. (EltVT == MVT::f16 && ST->hasFullFP16());
  437. };
  438. EVT SrcTy = TLI->getValueType(DL, Src);
  439. EVT DstTy = TLI->getValueType(DL, Dst);
  440. if (!SrcTy.isSimple() || !DstTy.isSimple())
  441. return AdjustCost(
  442. BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
443. // Extending masked loads / truncating masked stores are expensive because we
444. // currently don't split them. This means we'll likely end up
445. // loading/storing each element individually (hence the high cost).
  446. if ((ST->hasMVEIntegerOps() &&
  447. (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
  448. Opcode == Instruction::SExt)) ||
  449. (ST->hasMVEFloatOps() &&
  450. (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
  451. IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
  452. if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
  453. return 2 * DstTy.getVectorNumElements() *
  454. ST->getMVEVectorCostFactor(CostKind);
455. // The extension of other kinds of loads is free.
  456. if (CCH == TTI::CastContextHint::Normal ||
  457. CCH == TTI::CastContextHint::Masked) {
  458. static const TypeConversionCostTblEntry LoadConversionTbl[] = {
  459. {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
  460. {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
  461. {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
  462. {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
  463. {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
  464. {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
  465. {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
  466. {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
  467. {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
  468. {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
  469. {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
  470. {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
  471. };
  472. if (const auto *Entry = ConvertCostTableLookup(
  473. LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
  474. return AdjustCost(Entry->Cost);
  475. static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
  476. {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
  477. {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
  478. {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
  479. {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
  480. {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
  481. {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
482. // The following extend from a legal type to an illegal type, so the load
483. // needs to be split. This introduces an extra load operation, but the
484. // extend is still "free".
  485. {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
  486. {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
  487. {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
  488. {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
  489. {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
  490. {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
  491. };
  492. if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
  493. if (const auto *Entry =
  494. ConvertCostTableLookup(MVELoadConversionTbl, ISD,
  495. DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
  496. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  497. }
  498. static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
  499. // FPExtends are similar but also require the VCVT instructions.
  500. {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
  501. {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
  502. };
  503. if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
  504. if (const auto *Entry =
  505. ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
  506. DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
  507. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  508. }
  509. // The truncate of a store is free. This is the mirror of extends above.
  510. static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
  511. {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
  512. {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
  513. {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
  514. {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
  515. {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
  516. {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
  517. {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
  518. };
  519. if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
  520. if (const auto *Entry =
  521. ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
  522. SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
  523. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  524. }
  525. static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
  526. {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
  527. {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
  528. };
  529. if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
  530. if (const auto *Entry =
  531. ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
  532. SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
  533. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  534. }
  535. }
  536. // NEON vector operations that can extend their inputs.
  537. if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
  538. I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
  539. static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
  540. // vaddl
  541. { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
  542. { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
  543. // vsubl
  544. { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
  545. { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
  546. // vmull
  547. { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
  548. { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
  549. // vshll
  550. { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
  551. { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
  552. };
  553. auto *User = cast<Instruction>(*I->user_begin());
  554. int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
  555. if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
  556. DstTy.getSimpleVT(),
  557. SrcTy.getSimpleVT())) {
  558. return AdjustCost(Entry->Cost);
  559. }
  560. }
  561. // Single to/from double precision conversions.
  562. if (Src->isVectorTy() && ST->hasNEON() &&
  563. ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
  564. DstTy.getScalarType() == MVT::f32) ||
  565. (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
  566. DstTy.getScalarType() == MVT::f64))) {
  567. static const CostTblEntry NEONFltDblTbl[] = {
  568. // Vector fptrunc/fpext conversions.
  569. {ISD::FP_ROUND, MVT::v2f64, 2},
  570. {ISD::FP_EXTEND, MVT::v2f32, 2},
  571. {ISD::FP_EXTEND, MVT::v4f32, 4}};
  572. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  573. if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
  574. return AdjustCost(LT.first * Entry->Cost);
  575. }
  576. // Some arithmetic, load and store operations have specific instructions
  577. // to cast up/down their types automatically at no extra cost.
  578. // TODO: Get these tables to know at least what the related operations are.
  579. static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
  580. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
  581. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
  582. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
  583. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
  584. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
  585. { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
  586. // The number of vmovl instructions for the extension.
  587. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
  588. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
  589. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
  590. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
  591. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
  592. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
  593. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
  594. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
  595. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
  596. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
  597. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
  598. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
  599. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
  600. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
  601. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
  602. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
  603. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  604. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  605. // Operations that we legalize using splitting.
  606. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
  607. { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
  608. // Vector float <-> i32 conversions.
  609. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  610. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  611. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
  612. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
  613. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
  614. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
  615. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  616. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  617. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
  618. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
  619. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
  620. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
  621. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  622. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  623. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  624. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  625. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  626. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  627. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
  628. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
  629. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
  630. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
  631. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
  632. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  633. { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
  634. { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
  635. { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
  636. { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
  637. // Vector double <-> i32 conversions.
  638. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  639. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  640. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
  641. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
  642. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
  643. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
  644. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  645. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  646. { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
  647. { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
  648. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
  649. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
  650. { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
  651. { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  652. };
  653. if (SrcTy.isVector() && ST->hasNEON()) {
  654. if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
  655. DstTy.getSimpleVT(),
  656. SrcTy.getSimpleVT()))
  657. return AdjustCost(Entry->Cost);
  658. }
  659. // Scalar float to integer conversions.
  660. static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
  661. { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
  662. { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
  663. { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
  664. { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
  665. { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
  666. { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
  667. { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
  668. { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
  669. { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
  670. { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
  671. { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
  672. { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
  673. { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
  674. { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
  675. { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
  676. { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
  677. { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
  678. { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
  679. { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
  680. { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  681. };
  682. if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
  683. if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
  684. DstTy.getSimpleVT(),
  685. SrcTy.getSimpleVT()))
  686. return AdjustCost(Entry->Cost);
  687. }
  688. // Scalar integer to float conversions.
  689. static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
  690. { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
  691. { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
  692. { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
  693. { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
  694. { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
  695. { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
  696. { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
  697. { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
  698. { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
  699. { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
  700. { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
  701. { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
  702. { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
  703. { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
  704. { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
  705. { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
  706. { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
  707. { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
  708. { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
  709. { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  710. };
  711. if (SrcTy.isInteger() && ST->hasNEON()) {
  712. if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
  713. ISD, DstTy.getSimpleVT(),
  714. SrcTy.getSimpleVT()))
  715. return AdjustCost(Entry->Cost);
  716. }
717. // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
718. // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
719. // are linearised so take more.
  720. static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
  721. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
  722. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
  723. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
  724. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
  725. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
  726. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
  727. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
  728. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
  729. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
  730. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
  731. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
  732. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  733. };
  734. if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
  735. if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
  736. ISD, DstTy.getSimpleVT(),
  737. SrcTy.getSimpleVT()))
  738. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  739. }
  740. if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
741. // As a general rule, fp conversions that were not matched above are
742. // scalarized and cost 1 vcvt per lane, so long as the instruction is
743. // available. If not, they become a series of function calls.
  744. const InstructionCost CallCost =
  745. getCallInstrCost(nullptr, Dst, {Src}, CostKind);
  746. int Lanes = 1;
  747. if (SrcTy.isFixedLengthVector())
  748. Lanes = SrcTy.getVectorNumElements();
  749. if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
  750. return Lanes;
  751. else
  752. return Lanes * CallCost;
  753. }
  754. if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
  755. SrcTy.isFixedLengthVector()) {
756. // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
757. // expensive: 2 instructions per lane.
  758. if ((SrcTy.getScalarType() == MVT::i8 ||
  759. SrcTy.getScalarType() == MVT::i16 ||
  760. SrcTy.getScalarType() == MVT::i32) &&
  761. SrcTy.getSizeInBits() > 128 &&
  762. SrcTy.getSizeInBits() > DstTy.getSizeInBits())
  763. return SrcTy.getVectorNumElements() * 2;
  764. }
  765. // Scalar integer conversion costs.
  766. static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
  767. // i16 -> i64 requires two dependent operations.
  768. { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
  769. // Truncates on i64 are assumed to be free.
  770. { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
  771. { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
  772. { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
  773. { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  774. };
  775. if (SrcTy.isInteger()) {
  776. if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
  777. DstTy.getSimpleVT(),
  778. SrcTy.getSimpleVT()))
  779. return AdjustCost(Entry->Cost);
  780. }
  781. int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
  782. ? ST->getMVEVectorCostFactor(CostKind)
  783. : 1;
  784. return AdjustCost(
  785. BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
  786. }
  787. InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
  788. TTI::TargetCostKind CostKind,
  789. unsigned Index, Value *Op0,
  790. Value *Op1) {
791. // Penalize inserting into a D-subregister. We end up with a three times
792. // lower estimated throughput on Swift.
  793. if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
  794. ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
  795. return 3;
  796. if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
  797. Opcode == Instruction::ExtractElement)) {
  798. // Cross-class copies are expensive on many microarchitectures,
  799. // so assume they are expensive by default.
  800. if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
  801. return 3;
802. // Even if it's not a cross-class copy, this likely leads to mixing
803. // of NEON and VFP code and should therefore be penalized.
  804. if (ValTy->isVectorTy() &&
  805. ValTy->getScalarSizeInBits() <= 32)
  806. return std::max<InstructionCost>(
  807. BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
  808. 2U);
  809. }
  810. if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
  811. Opcode == Instruction::ExtractElement)) {
812. // Integer cross-lane moves are more expensive than float, which can
813. // sometimes just be vmovs. Integers involve being passed to GPR registers,
814. // causing more of a delay.
  815. std::pair<InstructionCost, MVT> LT =
  816. getTypeLegalizationCost(ValTy->getScalarType());
  817. return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  818. }
  819. return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  820. }
  821. InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  822. Type *CondTy,
  823. CmpInst::Predicate VecPred,
  824. TTI::TargetCostKind CostKind,
  825. const Instruction *I) {
  826. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  827. // Thumb scalar code size cost for select.
  828. if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
  829. ST->isThumb() && !ValTy->isVectorTy()) {
  830. // Assume expensive structs.
  831. if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
  832. return TTI::TCC_Expensive;
  833. // Select costs can vary because they:
  834. // - may require one or more conditional mov (including an IT),
  835. // - can't operate directly on immediates,
  836. // - require live flags, which we can't copy around easily.
  837. InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
  838. // Possible IT instruction for Thumb2, or more for Thumb1.
  839. ++Cost;
  840. // i1 values may need rematerialising by using mov immediates and/or
  841. // flag setting instructions.
  842. if (ValTy->isIntegerTy(1))
  843. ++Cost;
  844. return Cost;
  845. }
  846. // If this is a vector min/max/abs, use the cost of that intrinsic directly
  847. // instead. Hopefully when min/max intrinsics are more prevalent this code
  848. // will not be needed.
  849. const Instruction *Sel = I;
  850. if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
  851. Sel->hasOneUse())
  852. Sel = cast<Instruction>(Sel->user_back());
  853. if (Sel && ValTy->isVectorTy() &&
  854. (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
  855. const Value *LHS, *RHS;
  856. SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
  857. unsigned IID = 0;
  858. switch (SPF) {
  859. case SPF_ABS:
  860. IID = Intrinsic::abs;
  861. break;
  862. case SPF_SMIN:
  863. IID = Intrinsic::smin;
  864. break;
  865. case SPF_SMAX:
  866. IID = Intrinsic::smax;
  867. break;
  868. case SPF_UMIN:
  869. IID = Intrinsic::umin;
  870. break;
  871. case SPF_UMAX:
  872. IID = Intrinsic::umax;
  873. break;
  874. case SPF_FMINNUM:
  875. IID = Intrinsic::minnum;
  876. break;
  877. case SPF_FMAXNUM:
  878. IID = Intrinsic::maxnum;
  879. break;
  880. default:
  881. break;
  882. }
  883. if (IID) {
  884. // The ICmp is free, the select gets the cost of the min/max/etc
  885. if (Sel != I)
  886. return 0;
  887. IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
  888. return getIntrinsicInstrCost(CostAttrs, CostKind);
  889. }
  890. }
  891. // On NEON a vector select gets lowered to vbsl.
  892. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
  893. // Lowering of some vector selects is currently far from perfect.
  894. static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
  895. { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
  896. { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
  897. { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
  898. };
  899. EVT SelCondTy = TLI->getValueType(DL, CondTy);
  900. EVT SelValTy = TLI->getValueType(DL, ValTy);
  901. if (SelCondTy.isSimple() && SelValTy.isSimple()) {
  902. if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
  903. SelCondTy.getSimpleVT(),
  904. SelValTy.getSimpleVT()))
  905. return Entry->Cost;
  906. }
  907. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  908. return LT.first;
  909. }
  910. if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
  911. (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
  912. cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
  913. FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
  914. FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
  915. if (!VecCondTy)
  916. VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
917. // If we don't have mve.fp, any fp operations will need to be scalarized.
  918. if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
919. // One scalarization insert, one scalarization extract and the cost of the
920. // fcmps.
  921. return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
  922. /*Extract*/ true, CostKind) +
  923. BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
  924. /*Extract*/ false, CostKind) +
  925. VecValTy->getNumElements() *
  926. getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
  927. VecCondTy->getScalarType(), VecPred,
  928. CostKind, I);
  929. }
  930. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  931. int BaseCost = ST->getMVEVectorCostFactor(CostKind);
932. // There are two types: the input that specifies the type of the compare
933. // and the output vXi1 type. Because we don't know how the output will be
934. // split, we may need an expensive shuffle to get the two in sync. This has
935. // the effect of making larger-than-legal compares (v8i32, for example)
936. // expensive.
  937. if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
  938. if (LT.first > 1)
  939. return LT.first * BaseCost +
  940. BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
  941. /*Extract*/ false, CostKind);
  942. return BaseCost;
  943. }
  944. }
  945. // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  946. // for "multiple beats" potentially needed by MVE instructions.
  947. int BaseCost = 1;
  948. if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
  949. BaseCost = ST->getMVEVectorCostFactor(CostKind);
  950. return BaseCost *
  951. BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  952. }
  953. InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
  954. ScalarEvolution *SE,
  955. const SCEV *Ptr) {
  956. // Address computations in vectorized code with non-consecutive addresses will
  957. // likely result in more instructions compared to scalar code where the
  958. // computation can more often be merged into the index mode. The resulting
  959. // extra micro-ops can significantly decrease throughput.
  960. unsigned NumVectorInstToHideOverhead = 10;
  961. int MaxMergeDistance = 64;
  962. if (ST->hasNEON()) {
  963. if (Ty->isVectorTy() && SE &&
  964. !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
  965. return NumVectorInstToHideOverhead;
  966. // In many cases the address computation is not merged into the instruction
  967. // addressing mode.
  968. return 1;
  969. }
  970. return BaseT::getAddressComputationCost(Ty, SE, Ptr);
  971. }
  972. bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  973. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
  974. // If a VCTP is part of a chain, it's already profitable and shouldn't be
  975. // optimized, else LSR may block tail-predication.
  976. switch (II->getIntrinsicID()) {
  977. case Intrinsic::arm_mve_vctp8:
  978. case Intrinsic::arm_mve_vctp16:
  979. case Intrinsic::arm_mve_vctp32:
  980. case Intrinsic::arm_mve_vctp64:
  981. return true;
  982. default:
  983. break;
  984. }
  985. }
  986. return false;
  987. }
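// MVE masked loads/stores are legal only when the access is naturally aligned
// for the element size; v2i1 predicates and extending/truncating fp accesses
// are not supported.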
  988. bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  989. if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
  990. return false;
  991. if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
  992. // Don't support v2i1 yet.
  993. if (VecTy->getNumElements() == 2)
  994. return false;
  995. // We don't support extending fp types.
  996. unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
  997. if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
  998. return false;
  999. }
  1000. unsigned EltWidth = DataTy->getScalarSizeInBits();
  1001. return (EltWidth == 32 && Alignment >= 4) ||
  1002. (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
  1003. }
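// Gathers/scatters have the same element-size alignment requirement as the
// masked loads/stores above.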
  1004. bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
  1005. if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
  1006. return false;
  1007. unsigned EltWidth = Ty->getScalarSizeInBits();
  1008. return ((EltWidth == 32 && Alignment >= 4) ||
  1009. (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
  1010. }
  1011. /// Given a memcpy/memset/memmove instruction, return the number of memory
  1012. /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
  1013. /// call is used.
  1014. int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  1015. MemOp MOp;
  1016. unsigned DstAddrSpace = ~0u;
  1017. unsigned SrcAddrSpace = ~0u;
  1018. const Function *F = I->getParent()->getParent();
  1019. if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
  1020. ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
  1021. // If 'size' is not a constant, a library call will be generated.
  1022. if (!C)
  1023. return -1;
  1024. const unsigned Size = C->getValue().getZExtValue();
  1025. const Align DstAlign = *MC->getDestAlign();
  1026. const Align SrcAlign = *MC->getSourceAlign();
  1027. MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
  1028. /*IsVolatile*/ false);
  1029. DstAddrSpace = MC->getDestAddressSpace();
  1030. SrcAddrSpace = MC->getSourceAddressSpace();
  1031. }
  1032. else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
  1033. ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
  1034. // If 'size' is not a constant, a library call will be generated.
  1035. if (!C)
  1036. return -1;
  1037. const unsigned Size = C->getValue().getZExtValue();
  1038. const Align DstAlign = *MS->getDestAlign();
  1039. MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
  1040. /*IsZeroMemset*/ false, /*IsVolatile*/ false);
  1041. DstAddrSpace = MS->getDestAddressSpace();
  1042. }
  1043. else
  1044. llvm_unreachable("Expected a memcpy/move or memset!");
  1045. unsigned Limit, Factor = 2;
  1046. switch(I->getIntrinsicID()) {
  1047. case Intrinsic::memcpy:
  1048. Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
  1049. break;
  1050. case Intrinsic::memmove:
  1051. Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
  1052. break;
  1053. case Intrinsic::memset:
  1054. Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
  1055. Factor = 1;
  1056. break;
  1057. default:
  1058. llvm_unreachable("Expected a memcpy/move or memset!");
  1059. }
1060. // MemOps will be populated with a list of data types that need to be
1061. // loaded and stored. That's why the number of entries is multiplied by
1062. // Factor (2 for memcpy/memmove, 1 for memset) to get the cost.
  1063. std::vector<EVT> MemOps;
  1064. if (getTLI()->findOptimalMemOpLowering(
  1065. MemOps, Limit, MOp, DstAddrSpace,
  1066. SrcAddrSpace, F->getAttributes()))
  1067. return MemOps.size() * Factor;
  1068. // If we can't find an optimal memop lowering, return the default cost
  1069. return -1;
  1070. }
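// Worked example for getNumMemOps above (illustrative only; the exact types
// chosen depend on the subtarget and on findOptimalMemOpLowering):
//   memcpy(dst, src, 16) with 4-byte-aligned pointers might be lowered as
//   four i32 load/store pairs, i.e. MemOps == {i32, i32, i32, i32}, so the
//   function returns 4 * Factor = 4 * 2 = 8 memory operations.
//   A memcpy with a non-constant length returns -1, signalling a libcall.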
  1071. InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  1072. int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
  1073. // To model the cost of a library call, we assume 1 for the call, and
  1074. // 3 for the argument setup.
  1075. if (NumOps == -1)
  1076. return 4;
  1077. return NumOps;
  1078. }
  1079. InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
  1080. VectorType *Tp, ArrayRef<int> Mask,
  1081. TTI::TargetCostKind CostKind,
  1082. int Index, VectorType *SubTp,
  1083. ArrayRef<const Value *> Args) {
  1084. Kind = improveShuffleKindFromMask(Kind, Mask);
  1085. if (ST->hasNEON()) {
  1086. if (Kind == TTI::SK_Broadcast) {
  1087. static const CostTblEntry NEONDupTbl[] = {
  1088. // VDUP handles these cases.
  1089. {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
  1090. {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
  1091. {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
  1092. {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
  1093. {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
  1094. {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
  1095. {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
  1096. {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
  1097. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
  1098. {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
  1099. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  1100. if (const auto *Entry =
  1101. CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
  1102. return LT.first * Entry->Cost;
  1103. }
  1104. if (Kind == TTI::SK_Reverse) {
  1105. static const CostTblEntry NEONShuffleTbl[] = {
  1106. // Reverse shuffle cost one instruction if we are shuffling within a
  1107. // double word (vrev) or two if we shuffle a quad word (vrev, vext).
  1108. {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
  1109. {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
  1110. {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
  1111. {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
  1112. {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
  1113. {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
  1114. {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
  1115. {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
  1116. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
  1117. {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
  1118. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  1119. if (const auto *Entry =
  1120. CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
  1121. return LT.first * Entry->Cost;
  1122. }
  1123. if (Kind == TTI::SK_Select) {
  1124. static const CostTblEntry NEONSelShuffleTbl[] = {
1125. // Select shuffle cost table for ARM. Cost is the number of
1126. // instructions required to create the shuffled vector.
  1128. {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
  1129. {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
  1130. {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
  1131. {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
  1132. {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
  1133. {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
  1134. {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
  1135. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
  1136. {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
  1137. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  1138. if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
  1139. ISD::VECTOR_SHUFFLE, LT.second))
  1140. return LT.first * Entry->Cost;
  1141. }
  1142. }
  1143. if (ST->hasMVEIntegerOps()) {
  1144. if (Kind == TTI::SK_Broadcast) {
  1145. static const CostTblEntry MVEDupTbl[] = {
  1146. // VDUP handles these cases.
  1147. {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
  1148. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
  1149. {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
  1150. {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
  1151. {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
  1152. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  1153. if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
  1154. LT.second))
  1155. return LT.first * Entry->Cost *
  1156. ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
  1157. }
  1158. if (!Mask.empty()) {
  1159. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  1160. if (LT.second.isVector() &&
  1161. Mask.size() <= LT.second.getVectorNumElements() &&
  1162. (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
  1163. isVREVMask(Mask, LT.second, 64)))
  1164. return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
  1165. }
  1166. }
  1167. int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
  1168. ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
  1169. : 1;
  1170. return BaseCost *
  1171. BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
  1172. }
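// Illustrative readings of the tables in getShuffleCost above (assuming
// LT.first == 1, i.e. the type is already legal):
//   - NEON reverse of <8 x i16>: cost 2 (a vrev plus a vext, per
//     NEONShuffleTbl).
//   - NEON or MVE broadcast of <4 x i32>: cost 1 (a single vdup), with the
//     MVE result additionally scaled by getMVEVectorCostFactor.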
  1173. InstructionCost ARMTTIImpl::getArithmeticInstrCost(
  1174. unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
  1175. TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
  1176. ArrayRef<const Value *> Args,
  1177. const Instruction *CxtI) {
  1178. int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  1179. if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
  1180. // Make operations on i1 relatively expensive as this often involves
  1181. // combining predicates. AND and XOR should be easier to handle with IT
  1182. // blocks.
  1183. switch (ISDOpcode) {
  1184. default:
  1185. break;
  1186. case ISD::AND:
  1187. case ISD::XOR:
  1188. return 2;
  1189. case ISD::OR:
  1190. return 3;
  1191. }
  1192. }
  1193. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  1194. if (ST->hasNEON()) {
  1195. const unsigned FunctionCallDivCost = 20;
  1196. const unsigned ReciprocalDivCost = 10;
  1197. static const CostTblEntry CostTbl[] = {
  1198. // Division.
1199. // These costs are somewhat arbitrary. Choose a cost of 20 to indicate that
1200. // vectorizing division (added function call) is going to be very expensive.
1201. // Double register types.
  1202. { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
  1203. { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
  1204. { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
  1205. { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
  1206. { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
  1207. { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
  1208. { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
  1209. { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
  1210. { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
  1211. { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
  1212. { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
  1213. { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
  1214. { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
  1215. { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
  1216. { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
  1217. { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
  1218. // Quad register types.
  1219. { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
  1220. { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
  1221. { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
  1222. { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
  1223. { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
  1224. { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
  1225. { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
  1226. { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
  1227. { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
  1228. { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
  1229. { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
  1230. { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
  1231. { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
  1232. { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
  1233. { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
  1234. { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
  1235. // Multiplication.
  1236. };
  1237. if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
  1238. return LT.first * Entry->Cost;
  1239. InstructionCost Cost = BaseT::getArithmeticInstrCost(
  1240. Opcode, Ty, CostKind, Op1Info, Op2Info);
1241. // This is somewhat of a hack. The problem we are facing is that SROA
1242. // creates sequences of shift/and/or instructions to construct values.
1243. // These sequences are recognized by ISel and have zero cost. Not so for
1244. // the vectorized code. Because we have support for v2i64 but not i64 those
1245. // sequences look particularly beneficial to vectorize.
1246. // To work around this we increase the cost of v2i64 operations to make them
1247. // seem less beneficial.
  1248. if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
  1249. Cost += 4;
  1250. return Cost;
  1251. }
  1252. // If this operation is a shift on arm/thumb2, it might well be folded into
  1253. // the following instruction, hence having a cost of 0.
  1254. auto LooksLikeAFreeShift = [&]() {
  1255. if (ST->isThumb1Only() || Ty->isVectorTy())
  1256. return false;
  1257. if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
  1258. return false;
  1259. if (!Op2Info.isUniform() || !Op2Info.isConstant())
  1260. return false;
1261. // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
  1262. switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
  1263. case Instruction::Add:
  1264. case Instruction::Sub:
  1265. case Instruction::And:
  1266. case Instruction::Xor:
  1267. case Instruction::Or:
  1268. case Instruction::ICmp:
  1269. return true;
  1270. default:
  1271. return false;
  1272. }
  1273. };
  1274. if (LooksLikeAFreeShift())
  1275. return 0;
  1276. // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  1277. // for "multiple beats" potentially needed by MVE instructions.
  1278. int BaseCost = 1;
  1279. if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
  1280. BaseCost = ST->getMVEVectorCostFactor(CostKind);
1281. // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1282. // without treating floats as more expensive than scalars or increasing the
1283. // costs for custom operations. The result is also multiplied by the
1284. // MVEVectorCostFactor where appropriate.
  1285. if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
  1286. return LT.first * BaseCost;
  1287. // Else this is expand, assume that we need to scalarize this op.
  1288. if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
  1289. unsigned Num = VTy->getNumElements();
  1290. InstructionCost Cost =
  1291. getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
  1292. // Return the cost of multiple scalar invocation plus the cost of
  1293. // inserting and extracting the values.
  1294. SmallVector<Type *> Tys(Args.size(), Ty);
  1295. return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
  1296. Num * Cost;
  1297. }
  1298. return BaseCost;
  1299. }
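// Worked examples for getArithmeticInstrCost above (illustrative, assuming a
// legal type so LT.first == 1):
//   - sdiv <4 x i32> on a NEON target: CostTbl gives 4 * FunctionCallDivCost
//     = 80, reflecting that each lane ends up as a library-call division.
//   - A scalar shl by a constant whose only user is an add, on a non-NEON
//     Thumb2 target: LooksLikeAFreeShift() fires and the cost is 0, since
//     the shift can be folded into the add's shifted operand.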
  1300. InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
  1301. MaybeAlign Alignment,
  1302. unsigned AddressSpace,
  1303. TTI::TargetCostKind CostKind,
  1304. TTI::OperandValueInfo OpInfo,
  1305. const Instruction *I) {
  1306. // TODO: Handle other cost kinds.
  1307. if (CostKind != TTI::TCK_RecipThroughput)
  1308. return 1;
  1309. // Type legalization can't handle structs
  1310. if (TLI->getValueType(DL, Src, true) == MVT::Other)
  1311. return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1312. CostKind);
  1313. if (ST->hasNEON() && Src->isVectorTy() &&
  1314. (Alignment && *Alignment != Align(16)) &&
  1315. cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
  1316. // Unaligned loads/stores are extremely inefficient.
1317. // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
  1318. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  1319. return LT.first * 4;
  1320. }
  1321. // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
  1322. // Same for stores.
  1323. if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
  1324. ((Opcode == Instruction::Load && I->hasOneUse() &&
  1325. isa<FPExtInst>(*I->user_begin())) ||
  1326. (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
  1327. FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
  1328. Type *DstTy =
  1329. Opcode == Instruction::Load
  1330. ? (*I->user_begin())->getType()
  1331. : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
  1332. if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
  1333. DstTy->getScalarType()->isFloatTy())
  1334. return ST->getMVEVectorCostFactor(CostKind);
  1335. }
  1336. int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
  1337. ? ST->getMVEVectorCostFactor(CostKind)
  1338. : 1;
  1339. return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1340. CostKind, OpInfo, I);
  1341. }
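// Illustrative cases for getMemoryOpCost above (the concrete numbers depend
// on the legalized type and the subtarget):
//   - A misaligned NEON load/store of <2 x double> is charged LT.first * 4,
//     since vld1/vst1 is much slower than vldr/vstr here.
//   - An MVE load of <4 x half> whose single user is an fpext to <4 x float>
//     is charged only the MVE cost factor, modelling the extending load.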
  1342. InstructionCost
  1343. ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
  1344. unsigned AddressSpace,
  1345. TTI::TargetCostKind CostKind) {
  1346. if (ST->hasMVEIntegerOps()) {
  1347. if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
  1348. return ST->getMVEVectorCostFactor(CostKind);
  1349. if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
  1350. return ST->getMVEVectorCostFactor(CostKind);
  1351. }
  1352. if (!isa<FixedVectorType>(Src))
  1353. return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
  1354. CostKind);
1355. // Scalar cost, which is currently very high due to the inefficiency of the
1356. // generated code.
  1357. return cast<FixedVectorType>(Src)->getNumElements() * 8;
  1358. }
  1359. InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
  1360. unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
  1361. Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  1362. bool UseMaskForCond, bool UseMaskForGaps) {
  1363. assert(Factor >= 2 && "Invalid interleave factor");
  1364. assert(isa<VectorType>(VecTy) && "Expect a vector type");
1365. // vldN/vstN don't support vector types with i64/f64 elements.
  1366. bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
  1367. if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
  1368. !UseMaskForCond && !UseMaskForGaps) {
  1369. unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  1370. auto *SubVecTy =
  1371. FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
  1372. // vldN/vstN only support legal vector types of size 64 or 128 in bits.
  1373. // Accesses having vector types that are a multiple of 128 bits can be
  1374. // matched to more than one vldN/vstN instruction.
  1375. int BaseCost =
  1376. ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
  1377. if (NumElts % Factor == 0 &&
  1378. TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
  1379. return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  1380. // Some smaller than legal interleaved patterns are cheap as we can make
  1381. // use of the vmovn or vrev patterns to interleave a standard load. This is
  1382. // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
  1383. // promoted differently). The cost of 2 here is then a load and vrev or
  1384. // vmovn.
  1385. if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
  1386. VecTy->isIntOrIntVectorTy() &&
  1387. DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
  1388. return 2 * BaseCost;
  1389. }
  1390. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  1391. Alignment, AddressSpace, CostKind,
  1392. UseMaskForCond, UseMaskForGaps);
  1393. }
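// Illustrative example for getInterleavedMemoryOpCost above (a sketch; the
// exact result depends on isLegalInterleavedAccessType for the subtarget):
//   De-interleaving <8 x i16> with Factor == 2 gives SubVecTy == <4 x i16>
//   (64 bits). If that is a legal interleaved access type, the cost is
//   Factor * BaseCost * getNumInterleavedAccesses(SubVecTy, DL), e.g.
//   2 * 1 * 1 on NEON, matching a single vld2/vst2.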
  1394. InstructionCost ARMTTIImpl::getGatherScatterOpCost(
  1395. unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
  1396. Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  1397. using namespace PatternMatch;
  1398. if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
  1399. return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
  1400. Alignment, CostKind, I);
  1401. assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  1402. auto *VTy = cast<FixedVectorType>(DataTy);
  1403. // TODO: Splitting, once we do that.
  1404. unsigned NumElems = VTy->getNumElements();
  1405. unsigned EltSize = VTy->getScalarSizeInBits();
  1406. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
  1407. // For now, it is assumed that for the MVE gather instructions the loads are
  1408. // all effectively serialised. This means the cost is the scalar cost
  1409. // multiplied by the number of elements being loaded. This is possibly very
  1410. // conservative, but even so we still end up vectorising loops because the
  1411. // cost per iteration for many loops is lower than for scalar loops.
  1412. InstructionCost VectorCost =
  1413. NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  1414. // The scalarization cost should be a lot higher. We use the number of vector
  1415. // elements plus the scalarization overhead.
  1416. InstructionCost ScalarCost =
  1417. NumElems * LT.first +
  1418. BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
  1419. CostKind) +
  1420. BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
  1421. CostKind);
  1422. if (EltSize < 8 || Alignment < EltSize / 8)
  1423. return ScalarCost;
  1424. unsigned ExtSize = EltSize;
  1425. // Check whether there's a single user that asks for an extended type
  1426. if (I != nullptr) {
1427. // Depending on the caller of this function, a gather instruction will
1428. // either have opcode Instruction::Load or be a call to the masked_gather
1429. // intrinsic.
  1430. if ((I->getOpcode() == Instruction::Load ||
  1431. match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
  1432. I->hasOneUse()) {
  1433. const User *Us = *I->users().begin();
  1434. if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1435. // Only allow valid type combinations.
  1436. unsigned TypeSize =
  1437. cast<Instruction>(Us)->getType()->getScalarSizeInBits();
  1438. if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
  1439. (TypeSize == 16 && EltSize == 8)) &&
  1440. TypeSize * NumElems == 128) {
  1441. ExtSize = TypeSize;
  1442. }
  1443. }
  1444. }
  1445. // Check whether the input data needs to be truncated
  1446. TruncInst *T;
  1447. if ((I->getOpcode() == Instruction::Store ||
  1448. match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
  1449. (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
  1450. // Only allow valid type combinations
  1451. unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
  1452. if (((EltSize == 16 && TypeSize == 32) ||
  1453. (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
  1454. TypeSize * NumElems == 128)
  1455. ExtSize = TypeSize;
  1456. }
  1457. }
  1458. if (ExtSize * NumElems != 128 || NumElems < 4)
  1459. return ScalarCost;
  1460. // Any (aligned) i32 gather will not need to be scalarised.
  1461. if (ExtSize == 32)
  1462. return VectorCost;
  1463. // For smaller types, we need to ensure that the gep's inputs are correctly
  1464. // extended from a small enough value. Other sizes (including i64) are
  1465. // scalarized for now.
  1466. if (ExtSize != 8 && ExtSize != 16)
  1467. return ScalarCost;
  1468. if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
  1469. Ptr = BC->getOperand(0);
  1470. if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
  1471. if (GEP->getNumOperands() != 2)
  1472. return ScalarCost;
  1473. unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
  1474. // Scale needs to be correct (which is only relevant for i16s).
  1475. if (Scale != 1 && Scale * 8 != ExtSize)
  1476. return ScalarCost;
  1477. // And we need to zext (not sext) the indexes from a small enough type.
  1478. if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
  1479. if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
  1480. return VectorCost;
  1481. }
  1482. return ScalarCost;
  1483. }
  1484. return ScalarCost;
  1485. }
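// Illustrative outcomes for getGatherScatterOpCost above (a sketch):
//   - An aligned gather of <4 x i32> (ExtSize == 32) returns VectorCost,
//     i.e. NumElems * LT.first * the MVE cost factor.
//   - A gather of <2 x i64>, or one whose GEP indices are not zero-extended
//     from a small enough type, falls back to ScalarCost, which also adds
//     the insert/extract scalarization overhead.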
  1486. InstructionCost
  1487. ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
  1488. std::optional<FastMathFlags> FMF,
  1489. TTI::TargetCostKind CostKind) {
  1490. if (TTI::requiresOrderedReduction(FMF))
  1491. return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  1492. EVT ValVT = TLI->getValueType(DL, ValTy);
  1493. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  1494. if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
  1495. return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  1496. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1497. static const CostTblEntry CostTblAdd[]{
  1498. {ISD::ADD, MVT::v16i8, 1},
  1499. {ISD::ADD, MVT::v8i16, 1},
  1500. {ISD::ADD, MVT::v4i32, 1},
  1501. };
  1502. if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
  1503. return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
  1504. return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
  1505. }
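// Example for getArithmeticReductionCost above (illustrative): an integer
// add reduction of <4 x i32> hits the CostTblAdd entry and costs
// 1 * getMVEVectorCostFactor(CostKind) * LT.first, modelling a single VADDV.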
  1506. InstructionCost ARMTTIImpl::getExtendedReductionCost(
  1507. unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
  1508. std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
  1509. EVT ValVT = TLI->getValueType(DL, ValTy);
  1510. EVT ResVT = TLI->getValueType(DL, ResTy);
  1511. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  1512. switch (ISD) {
  1513. case ISD::ADD:
  1514. if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
  1515. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1516. // The legal cases are:
  1517. // VADDV u/s 8/16/32
  1518. // VADDLV u/s 32
1519. // Codegen currently cannot always handle larger-than-legal vectors very
1520. // well, especially for predicated reductions where the mask needs to be
1521. // split, so restrict to 128-bit or smaller input types.
  1522. unsigned RevVTSize = ResVT.getSizeInBits();
  1523. if (ValVT.getSizeInBits() <= 128 &&
  1524. ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
  1525. (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
  1526. (LT.second == MVT::v4i32 && RevVTSize <= 64)))
  1527. return ST->getMVEVectorCostFactor(CostKind) * LT.first;
  1528. }
  1529. break;
  1530. default:
  1531. break;
  1532. }
  1533. return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
  1534. CostKind);
  1535. }
  1536. InstructionCost
  1537. ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
  1538. VectorType *ValTy,
  1539. TTI::TargetCostKind CostKind) {
  1540. EVT ValVT = TLI->getValueType(DL, ValTy);
  1541. EVT ResVT = TLI->getValueType(DL, ResTy);
  1542. if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
  1543. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  1544. // The legal cases are:
  1545. // VMLAV u/s 8/16/32
  1546. // VMLALV u/s 16/32
1547. // Codegen currently cannot always handle larger-than-legal vectors very
1548. // well, especially for predicated reductions where the mask needs to be
1549. // split, so restrict to 128-bit or smaller input types.
  1550. unsigned RevVTSize = ResVT.getSizeInBits();
  1551. if (ValVT.getSizeInBits() <= 128 &&
  1552. ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
  1553. (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
  1554. (LT.second == MVT::v4i32 && RevVTSize <= 64)))
  1555. return ST->getMVEVectorCostFactor(CostKind) * LT.first;
  1556. }
  1557. return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
  1558. }
  1559. InstructionCost
  1560. ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  1561. TTI::TargetCostKind CostKind) {
  1562. switch (ICA.getID()) {
  1563. case Intrinsic::get_active_lane_mask:
1564. // Currently we make the somewhat optimistic assumption that
1565. // active_lane_masks are always free. In reality one may be freely folded
1566. // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1567. // of add/icmp code. We may need to improve this in the future, but being
1568. // able to detect whether it is free or not involves looking at a lot of
1569. // other code. We currently assume that the vectorizer inserted these, and
1570. // knew what it was doing in adding one.
  1571. if (ST->hasMVEIntegerOps())
  1572. return 0;
  1573. break;
  1574. case Intrinsic::sadd_sat:
  1575. case Intrinsic::ssub_sat:
  1576. case Intrinsic::uadd_sat:
  1577. case Intrinsic::usub_sat: {
  1578. if (!ST->hasMVEIntegerOps())
  1579. break;
  1580. Type *VT = ICA.getReturnType();
  1581. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
  1582. if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
  1583. LT.second == MVT::v16i8) {
1584. // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1585. // need to extend the type, as it uses shr(qadd(shl, shl)).
  1586. unsigned Instrs =
  1587. LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
  1588. return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
  1589. }
  1590. break;
  1591. }
  1592. case Intrinsic::abs:
  1593. case Intrinsic::smin:
  1594. case Intrinsic::smax:
  1595. case Intrinsic::umin:
  1596. case Intrinsic::umax: {
  1597. if (!ST->hasMVEIntegerOps())
  1598. break;
  1599. Type *VT = ICA.getReturnType();
  1600. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
  1601. if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
  1602. LT.second == MVT::v16i8)
  1603. return LT.first * ST->getMVEVectorCostFactor(CostKind);
  1604. break;
  1605. }
  1606. case Intrinsic::minnum:
  1607. case Intrinsic::maxnum: {
  1608. if (!ST->hasMVEFloatOps())
  1609. break;
  1610. Type *VT = ICA.getReturnType();
  1611. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
  1612. if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
  1613. return LT.first * ST->getMVEVectorCostFactor(CostKind);
  1614. break;
  1615. }
  1616. case Intrinsic::fptosi_sat:
  1617. case Intrinsic::fptoui_sat: {
  1618. if (ICA.getArgTypes().empty())
  1619. break;
  1620. bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
  1621. auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
  1622. EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1623. // Check for the legal types, with the correct subtarget features.
  1624. if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
  1625. (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
  1626. (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
  1627. return LT.first;
  1628. // Equally for MVE vector types
  1629. if (ST->hasMVEFloatOps() &&
  1630. (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
  1631. LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
  1632. return LT.first * ST->getMVEVectorCostFactor(CostKind);
  1633. // Otherwise we use a legal convert followed by a min+max
  1634. if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
  1635. (ST->hasFP64() && LT.second == MVT::f64) ||
  1636. (ST->hasFullFP16() && LT.second == MVT::f16) ||
  1637. (ST->hasMVEFloatOps() &&
  1638. (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
  1639. LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
  1640. Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
  1641. LT.second.getScalarSizeInBits());
  1642. InstructionCost Cost =
  1643. LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
  1644. IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
  1645. : Intrinsic::umin,
  1646. LegalTy, {LegalTy, LegalTy});
  1647. Cost += getIntrinsicInstrCost(Attrs1, CostKind);
  1648. IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
  1649. : Intrinsic::umax,
  1650. LegalTy, {LegalTy, LegalTy});
  1651. Cost += getIntrinsicInstrCost(Attrs2, CostKind);
  1652. return LT.first * Cost;
  1653. }
  1654. break;
  1655. }
  1656. }
  1657. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  1658. }
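// Illustrative cases for getIntrinsicInstrCost above:
//   - llvm.sadd.sat on <8 x i16> with MVE: one vqadd, so the cost is
//     LT.first * the MVE cost factor.
//   - llvm.fptosi.sat.i32.f32 with VFP2: cost LT.first, as the saturating
//     conversion maps onto the legal convert directly.
// (Both follow from the switch above; other types fall back to BaseT.)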
  1659. bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  1660. if (!F->isIntrinsic())
  1661. return BaseT::isLoweredToCall(F);
  1662. // Assume all Arm-specific intrinsics map to an instruction.
  1663. if (F->getName().startswith("llvm.arm"))
  1664. return false;
  1665. switch (F->getIntrinsicID()) {
  1666. default: break;
  1667. case Intrinsic::powi:
  1668. case Intrinsic::sin:
  1669. case Intrinsic::cos:
  1670. case Intrinsic::pow:
  1671. case Intrinsic::log:
  1672. case Intrinsic::log10:
  1673. case Intrinsic::log2:
  1674. case Intrinsic::exp:
  1675. case Intrinsic::exp2:
  1676. return true;
  1677. case Intrinsic::sqrt:
  1678. case Intrinsic::fabs:
  1679. case Intrinsic::copysign:
  1680. case Intrinsic::floor:
  1681. case Intrinsic::ceil:
  1682. case Intrinsic::trunc:
  1683. case Intrinsic::rint:
  1684. case Intrinsic::nearbyint:
  1685. case Intrinsic::round:
  1686. case Intrinsic::canonicalize:
  1687. case Intrinsic::lround:
  1688. case Intrinsic::llround:
  1689. case Intrinsic::lrint:
  1690. case Intrinsic::llrint:
  1691. if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
  1692. return true;
  1693. if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
  1694. return true;
1695. // Some operations can be handled by vector instructions; assume
1696. // unsupported vectors will be expanded into supported scalar ones.
1697. // TODO: Handle scalar operations properly.
  1698. return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  1699. case Intrinsic::masked_store:
  1700. case Intrinsic::masked_load:
  1701. case Intrinsic::masked_gather:
  1702. case Intrinsic::masked_scatter:
  1703. return !ST->hasMVEIntegerOps();
  1704. case Intrinsic::sadd_with_overflow:
  1705. case Intrinsic::uadd_with_overflow:
  1706. case Intrinsic::ssub_with_overflow:
  1707. case Intrinsic::usub_with_overflow:
  1708. case Intrinsic::sadd_sat:
  1709. case Intrinsic::uadd_sat:
  1710. case Intrinsic::ssub_sat:
  1711. case Intrinsic::usub_sat:
  1712. return false;
  1713. }
  1714. return BaseT::isLoweredToCall(F);
  1715. }
  1716. bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
  1717. unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
  1718. EVT VT = TLI->getValueType(DL, I.getType(), true);
  1719. if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
  1720. return true;
  1721. // Check if an intrinsic will be lowered to a call and assume that any
  1722. // other CallInst will generate a bl.
  1723. if (auto *Call = dyn_cast<CallInst>(&I)) {
  1724. if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
  1725. switch(II->getIntrinsicID()) {
  1726. case Intrinsic::memcpy:
  1727. case Intrinsic::memset:
  1728. case Intrinsic::memmove:
  1729. return getNumMemOps(II) == -1;
  1730. default:
  1731. if (const Function *F = Call->getCalledFunction())
  1732. return isLoweredToCall(F);
  1733. }
  1734. }
  1735. return true;
  1736. }
  1737. // FPv5 provides conversions between integer, double-precision,
  1738. // single-precision, and half-precision formats.
  1739. switch (I.getOpcode()) {
  1740. default:
  1741. break;
  1742. case Instruction::FPToSI:
  1743. case Instruction::FPToUI:
  1744. case Instruction::SIToFP:
  1745. case Instruction::UIToFP:
  1746. case Instruction::FPTrunc:
  1747. case Instruction::FPExt:
  1748. return !ST->hasFPARMv8Base();
  1749. }
1750. // FIXME: Unfortunately the approach of checking the Operation Action does
1751. // not catch all cases of Legalization that use library calls. Our
1752. // Legalization step categorizes some transformations into library calls as
1753. // Custom, Expand or even Legal when doing type legalization. So for now
1754. // we have to special-case, for instance, the SDIV of 64-bit integers and
1755. // the use of floating-point emulation.
  1756. if (VT.isInteger() && VT.getSizeInBits() >= 64) {
  1757. switch (ISD) {
  1758. default:
  1759. break;
  1760. case ISD::SDIV:
  1761. case ISD::UDIV:
  1762. case ISD::SREM:
  1763. case ISD::UREM:
  1764. case ISD::SDIVREM:
  1765. case ISD::UDIVREM:
  1766. return true;
  1767. }
  1768. }
  1769. // Assume all other non-float operations are supported.
  1770. if (!VT.isFloatingPoint())
  1771. return false;
1772. // We'll need a library call to handle most floats when using soft-float.
  1773. if (TLI->useSoftFloat()) {
  1774. switch (I.getOpcode()) {
  1775. default:
  1776. return true;
  1777. case Instruction::Alloca:
  1778. case Instruction::Load:
  1779. case Instruction::Store:
  1780. case Instruction::Select:
  1781. case Instruction::PHI:
  1782. return false;
  1783. }
  1784. }
  1785. // We'll need a libcall to perform double precision operations on a single
  1786. // precision only FPU.
  1787. if (I.getType()->isDoubleTy() && !ST->hasFP64())
  1788. return true;
  1789. // Likewise for half precision arithmetic.
  1790. if (I.getType()->isHalfTy() && !ST->hasFullFP16())
  1791. return true;
  1792. return false;
  1793. }
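// Examples of what maybeLoweredToCall above flags (illustrative):
//   - A 64-bit sdiv/urem is reported as a call, since it is legalized via a
//     runtime routine.
//   - An fadd on double when only a single-precision FPU is present
//     (!ST->hasFP64()) is reported as a call.
//   - Ordinary 32-bit integer arithmetic is not.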
  1794. bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
  1795. AssumptionCache &AC,
  1796. TargetLibraryInfo *LibInfo,
  1797. HardwareLoopInfo &HWLoopInfo) {
1798. // Low-overhead branches are only supported in the 'low-overhead branch'
1799. // extension of Armv8.1-M.
  1800. if (!ST->hasLOB() || DisableLowOverheadLoops) {
  1801. LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
  1802. return false;
  1803. }
  1804. if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
  1805. LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
  1806. return false;
  1807. }
  1808. const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  1809. if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
  1810. LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
  1811. return false;
  1812. }
  1813. const SCEV *TripCountSCEV =
  1814. SE.getAddExpr(BackedgeTakenCount,
  1815. SE.getOne(BackedgeTakenCount->getType()));
  1816. // We need to store the trip count in LR, a 32-bit register.
  1817. if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
  1818. LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
  1819. return false;
  1820. }
  1821. // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  1822. // point in generating a hardware loop if that's going to happen.
  1823. auto IsHardwareLoopIntrinsic = [](Instruction &I) {
  1824. if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
  1825. switch (Call->getIntrinsicID()) {
  1826. default:
  1827. break;
  1828. case Intrinsic::start_loop_iterations:
  1829. case Intrinsic::test_start_loop_iterations:
  1830. case Intrinsic::loop_decrement:
  1831. case Intrinsic::loop_decrement_reg:
  1832. return true;
  1833. }
  1834. }
  1835. return false;
  1836. };
1837. // Scan the instructions to see if there are any that we know will turn into
1838. // a call or if this loop is already a low-overhead loop or will become a
1839. // tail-predicated loop.
  1840. bool IsTailPredLoop = false;
  1841. auto ScanLoop = [&](Loop *L) {
  1842. for (auto *BB : L->getBlocks()) {
  1843. for (auto &I : *BB) {
  1844. if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
  1845. isa<InlineAsm>(I)) {
  1846. LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
  1847. return false;
  1848. }
  1849. if (auto *II = dyn_cast<IntrinsicInst>(&I))
  1850. IsTailPredLoop |=
  1851. II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
  1852. II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
  1853. II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
  1854. II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
  1855. II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
  1856. }
  1857. }
  1858. return true;
  1859. };
  1860. // Visit inner loops.
  1861. for (auto *Inner : *L)
  1862. if (!ScanLoop(Inner))
  1863. return false;
  1864. if (!ScanLoop(L))
  1865. return false;
  1866. // TODO: Check whether the trip count calculation is expensive. If L is the
  1867. // inner loop but we know it has a low trip count, calculating that trip
  1868. // count (in the parent loop) may be detrimental.
  1869. LLVMContext &C = L->getHeader()->getContext();
  1870. HWLoopInfo.CounterInReg = true;
  1871. HWLoopInfo.IsNestingLegal = false;
  1872. HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
  1873. HWLoopInfo.CountType = Type::getInt32Ty(C);
  1874. HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  1875. return true;
  1876. }
  1877. static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1878. // We don't allow icmps, and because we only look at single-block loops,
1879. // we simply count the icmps, i.e. there should only be 1 for the backedge.
  1880. if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
  1881. return false;
1882. // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
1883. // are not currently canonical, but soon will be. Code without them uses
1884. // icmp, and so is not tail-predicated as per the condition above. In order
1885. // to get the same performance we treat min and max the same as an icmp for
1886. // tailpred purposes for the moment (we often rely on non-tailpred and
1887. // higher VFs to pick more optimal instructions like VQDMULH. They need to
1888. // be recognized directly by the vectorizer).
  1889. if (auto *II = dyn_cast<IntrinsicInst>(&I))
  1890. if ((II->getIntrinsicID() == Intrinsic::smin ||
  1891. II->getIntrinsicID() == Intrinsic::smax ||
  1892. II->getIntrinsicID() == Intrinsic::umin ||
  1893. II->getIntrinsicID() == Intrinsic::umax) &&
  1894. ++ICmpCount > 1)
  1895. return false;
  1896. if (isa<FCmpInst>(&I))
  1897. return false;
  1898. // We could allow extending/narrowing FP loads/stores, but codegen is
  1899. // too inefficient so reject this for now.
  1900. if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
  1901. return false;
  1902. // Extends have to be extending-loads
  1903. if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
  1904. if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
  1905. return false;
  1906. // Truncs have to be narrowing-stores
  1907. if (isa<TruncInst>(&I) )
  1908. if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
  1909. return false;
  1910. return true;
  1911. }
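// Examples for canTailPredicateInstruction above (illustrative):
//   - "%w = zext i16 %x to i32" is only allowed when %x is a single-use
//     load, i.e. when it can become an extending load.
//   - "%t = trunc i32 %v to i16" is only allowed when its single user is a
//     store, i.e. when it can become a narrowing store.
//   - fcmp, fpext and fptrunc are always rejected.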
  1912. // To set up a tail-predicated loop, we need to know the total number of
  1913. // elements processed by that loop. Thus, we need to determine the element
  1914. // size and:
  1915. // 1) it should be uniform for all operations in the vector loop, so we
  1916. // e.g. don't want any widening/narrowing operations.
  1917. // 2) it should be smaller than i64s because we don't have vector operations
  1918. // that work on i64s.
  1919. // 3) we don't want elements to be reversed or shuffled, to make sure the
  1920. // tail-predication masks/predicates the right lanes.
  1921. //
  1922. static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
  1923. const DataLayout &DL,
  1924. const LoopAccessInfo *LAI) {
  1925. LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
  1926. // If there are live-out values, it is probably a reduction. We can predicate
  1927. // most reduction operations freely under MVE using a combination of
  1928. // prefer-predicated-reduction-select and inloop reductions. We limit this to
  1929. // floating point and integer reductions, but don't check for operators
  1930. // specifically here. If the value ends up not being a reduction (and so the
  1931. // vectorizer cannot tailfold the loop), we should fall back to standard
  1932. // vectorization automatically.
1933. SmallVector<Instruction *, 8> LiveOuts =
1934. llvm::findDefsUsedOutsideOfLoop(L);
  1935. bool ReductionsDisabled =
  1936. EnableTailPredication == TailPredication::EnabledNoReductions ||
  1937. EnableTailPredication == TailPredication::ForceEnabledNoReductions;
  1938. for (auto *I : LiveOuts) {
  1939. if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
  1940. !I->getType()->isHalfTy()) {
  1941. LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
  1942. "live-out value\n");
  1943. return false;
  1944. }
  1945. if (ReductionsDisabled) {
  1946. LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
  1947. return false;
  1948. }
  1949. }
  1950. // Next, check that all instructions can be tail-predicated.
  1951. PredicatedScalarEvolution PSE = LAI->getPSE();
  1952. SmallVector<Instruction *, 16> LoadStores;
  1953. int ICmpCount = 0;
  1954. for (BasicBlock *BB : L->blocks()) {
  1955. for (Instruction &I : BB->instructionsWithoutDebug()) {
  1956. if (isa<PHINode>(&I))
  1957. continue;
  1958. if (!canTailPredicateInstruction(I, ICmpCount)) {
  1959. LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
  1960. return false;
  1961. }
  1962. Type *T = I.getType();
  1963. if (T->getScalarSizeInBits() > 32) {
  1964. LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
  1965. return false;
  1966. }
  1967. if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
  1968. Value *Ptr = getLoadStorePointerOperand(&I);
  1969. Type *AccessTy = getLoadStoreType(&I);
  1970. int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
  1971. if (NextStride == 1) {
1972. // TODO: for now only allow consecutive strides of 1. We could support
1973. // other strides as long as they are uniform, but let's keep it simple
1974. // for now.
  1975. continue;
  1976. } else if (NextStride == -1 ||
  1977. (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
  1978. (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
1979. LLVM_DEBUG(dbgs()
1980. << "Consecutive strides of 2 found, vld2/vst2 can't "
1981. "be tail-predicated.\n");
  1982. return false;
  1983. // TODO: don't tail predicate if there is a reversed load?
  1984. } else if (EnableMaskedGatherScatters) {
  1985. // Gather/scatters do allow loading from arbitrary strides, at
  1986. // least if they are loop invariant.
  1987. // TODO: Loop variant strides should in theory work, too, but
  1988. // this requires further testing.
  1989. const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
  1990. if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
  1991. const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
  1992. if (PSE.getSE()->isLoopInvariant(Step, L))
  1993. continue;
  1994. }
  1995. }
  1996. LLVM_DEBUG(dbgs() << "Bad stride found, can't "
  1997. "tail-predicate\n.");
  1998. return false;
  1999. }
  2000. }
  2001. }
  2002. LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
  2003. return true;
  2004. }
  2005. bool ARMTTIImpl::preferPredicateOverEpilogue(
  2006. Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
  2007. TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
  2008. InterleavedAccessInfo *IAI) {
  2009. if (!EnableTailPredication) {
  2010. LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
  2011. return false;
  2012. }
  2013. // Creating a predicated vector loop is the first step for generating a
  2014. // tail-predicated hardware loop, for which we need the MVE masked
  2015. // load/stores instructions:
  2016. if (!ST->hasMVEIntegerOps())
  2017. return false;
  2018. // For now, restrict this to single block loops.
  2019. if (L->getNumBlocks() > 1) {
  2020. LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
  2021. "loop.\n");
  2022. return false;
  2023. }
  2024. assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
  2025. HardwareLoopInfo HWLoopInfo(L);
  2026. if (!HWLoopInfo.canAnalyze(*LI)) {
  2027. LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
  2028. "analyzable.\n");
  2029. return false;
  2030. }
  2031. // This checks if we have the low-overhead branch architecture
  2032. // extension, and if we will create a hardware-loop:
  2033. if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
  2034. LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
  2035. "profitable.\n");
  2036. return false;
  2037. }
  2038. if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
  2039. LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
  2040. "a candidate.\n");
  2041. return false;
  2042. }
  2043. return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
  2044. }
  2045. PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
  2046. if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
  2047. return PredicationStyle::None;
  2048. // Intrinsic @llvm.get.active.lane.mask is supported.
  2049. // It is used in the MVETailPredication pass, which requires the number of
2050. // elements processed by this vector loop to set up the tail-predicated
  2051. // loop.
  2052. return PredicationStyle::Data;
  2053. }
  2054. void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  2055. TTI::UnrollingPreferences &UP,
  2056. OptimizationRemarkEmitter *ORE) {
2057. // Enable upper-bound unrolling universally, not dependent upon the
2058. // conditions below.
  2059. UP.UpperBound = true;
  2060. // Only currently enable these preferences for M-Class cores.
  2061. if (!ST->isMClass())
  2062. return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
  2063. // Disable loop unrolling for Oz and Os.
  2064. UP.OptSizeThreshold = 0;
  2065. UP.PartialOptSizeThreshold = 0;
  2066. if (L->getHeader()->getParent()->hasOptSize())
  2067. return;
  2068. SmallVector<BasicBlock*, 4> ExitingBlocks;
  2069. L->getExitingBlocks(ExitingBlocks);
  2070. LLVM_DEBUG(dbgs() << "Loop has:\n"
  2071. << "Blocks: " << L->getNumBlocks() << "\n"
  2072. << "Exit blocks: " << ExitingBlocks.size() << "\n");
2073. // Only allow one exit other than the latch. This acts as an early exit
2074. // as it mirrors the profitability calculation of the runtime unroller.
  2075. if (ExitingBlocks.size() > 2)
  2076. return;
  2077. // Limit the CFG of the loop body for targets with a branch predictor.
  2078. // Allowing 4 blocks permits if-then-else diamonds in the body.
  2079. if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
  2080. return;
  2081. // Don't unroll vectorized loops, including the remainder loop
  2082. if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
  2083. return;
  2084. // Scan the loop: don't unroll loops with calls as this could prevent
  2085. // inlining.
  2086. InstructionCost Cost = 0;
  2087. for (auto *BB : L->getBlocks()) {
  2088. for (auto &I : *BB) {
2089. // Don't unroll vectorised loops. MVE does not benefit from it as much as
2090. // scalar code does.
  2091. if (I.getType()->isVectorTy())
  2092. return;
  2093. if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
  2094. if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
  2095. if (!isLoweredToCall(F))
  2096. continue;
  2097. }
  2098. return;
  2099. }
  2100. SmallVector<const Value*, 4> Operands(I.operand_values());
  2101. Cost += getInstructionCost(&I, Operands,
  2102. TargetTransformInfo::TCK_SizeAndLatency);
  2103. }
  2104. }
  2105. // On v6m cores, there are very few registers available. We can easily end up
  2106. // spilling and reloading more registers in an unrolled loop. Look at the
  2107. // number of LCSSA phis as a rough measure of how many registers will need to
  2108. // be live out of the loop, reducing the default unroll count if more than 1
2109. // value is needed. In the long run, all of this should be learnt by a
2110. // machine.
  2111. unsigned UnrollCount = 4;
  2112. if (ST->isThumb1Only()) {
  2113. unsigned ExitingValues = 0;
  2114. SmallVector<BasicBlock *, 4> ExitBlocks;
  2115. L->getExitBlocks(ExitBlocks);
  2116. for (auto *Exit : ExitBlocks) {
2117. // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2118. // only the last is expected to be needed for address operands.
  2119. unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
  2120. return PH.getNumOperands() != 1 ||
  2121. !isa<GetElementPtrInst>(PH.getOperand(0));
  2122. });
  2123. ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
  2124. }
  2125. if (ExitingValues)
  2126. UnrollCount /= ExitingValues;
  2127. if (UnrollCount <= 1)
  2128. return;
  2129. }
  2130. LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
  2131. LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
  2132. UP.Partial = true;
  2133. UP.Runtime = true;
  2134. UP.UnrollRemainder = true;
  2135. UP.DefaultUnrollRuntimeCount = UnrollCount;
  2136. UP.UnrollAndJam = true;
  2137. UP.UnrollAndJamInnerLoopThreshold = 60;
2138. // Force-unrolling small loops can be very useful because of the
2139. // branch-taken cost of the backedge.
  2140. if (Cost < 12)
  2141. UP.Force = true;
  2142. }
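// Illustrative outcome of getUnrollingPreferences above: a small, call-free,
// scalar loop on an M-class core whose SizeAndLatency cost is below 12 gets
// runtime unrolling with DefaultUnrollRuntimeCount == 4 and UP.Force set; on
// Thumb1-only cores that count is divided by the number of live-out values,
// and unrolling is skipped entirely once it would drop to 1.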
  2143. void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  2144. TTI::PeelingPreferences &PP) {
  2145. BaseT::getPeelingPreferences(L, SE, PP);
  2146. }
  2147. bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
  2148. TTI::ReductionFlags Flags) const {
  2149. if (!ST->hasMVEIntegerOps())
  2150. return false;
  2151. unsigned ScalarBits = Ty->getScalarSizeInBits();
  2152. switch (Opcode) {
  2153. case Instruction::Add:
  2154. return ScalarBits <= 64;
  2155. default:
  2156. return false;
  2157. }
  2158. }
  2159. bool ARMTTIImpl::preferPredicatedReductionSelect(
  2160. unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
  2161. if (!ST->hasMVEIntegerOps())
  2162. return false;
  2163. return true;
  2164. }
  2165. InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
  2166. int64_t BaseOffset,
  2167. bool HasBaseReg, int64_t Scale,
  2168. unsigned AddrSpace) const {
  2169. TargetLoweringBase::AddrMode AM;
  2170. AM.BaseGV = BaseGV;
  2171. AM.BaseOffs = BaseOffset;
  2172. AM.HasBaseReg = HasBaseReg;
  2173. AM.Scale = Scale;
  2174. if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
  2175. if (ST->hasFPAO())
  2176. return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
  2177. return 0;
  2178. }
  2179. return -1;
  2180. }