ARMTargetTransformInfo.cpp

//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

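// Check whether Callee may be inlined into Caller, based on how their
// subtarget feature sets compare.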
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

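// Prefer post-indexed addressing on MVE targets, no particular mode when
// optimising for size, and pre-indexed addressing for single-block Thumb2
// M-class loops.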
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

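// ARM-specific instcombine folds: NEON vld/vst alignment arguments, MVE
// predicate conversions (pred_i2v/pred_v2i), vadc carry bits and vmldava
// accumulation.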
Optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return None;
  }
  }
  return None;
}

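// Simplify the demanded vector elements of MVE narrowing intrinsics
// (vcvt_narrow, vqmovn, vshrn), which only demand the odd or even lanes of
// their vector operand.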
Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
  // operand index that specifies whether this is a top or bottom instruction,
  // which can differ between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
                                                 : APInt::getHighBitsSet(2, 1));
    return None;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return None;
}

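// Estimate the cost of materialising an integer immediate, based on whether
// it can be encoded directly by an ARM, Thumb2 or Thumb1 instruction.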
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for a FP Saturation pattern, where the instruction can be simplified to
// a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

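// Cast costs are mostly driven by the per-subtarget conversion tables below:
// free extends of loads, NEON/MVE lane conversions, and scalarised fp
// converts when no vector instruction is available.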
InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so need
        // to split the load. This introduces an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }
  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }
  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }
  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer values involve being passed to GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

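// Compare/select costs: Thumb scalar selects are costed for code size,
// vector min/max/abs patterns use intrinsic costs, and NEON/MVE vector
// compares and selects use the tables and cost factors below.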
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }
  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp, any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get the two in sync. This has
    // the effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

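// Masked loads/stores are only legal with MVE, for 8-bit elements or for
// sufficiently aligned 16/32-bit elements.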
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

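// Gathers/scatters follow the same MVE element-width and alignment rules,
// gated by the EnableMaskedGatherScatters option.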
bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that need to be loaded
  // and stored. That's why we multiply the number of elements by 2 to get the
  // cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}
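
/// Shuffle costs. Broadcasts, reverses and selects that map onto one (or a
/// couple of) NEON/MVE instructions are costed from the tables below; other
/// shuffles fall back to the base implementation, scaled by the MVE beat
/// factor for MVE vector types.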
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (LT.second.isVector() &&
          Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
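
/// Arithmetic costs. i1 logic on Thumb (for code size), NEON division and
/// v2i64 constant operations get special-cased costs; free-looking shifts cost
/// 0; otherwise legal operations cost one instruction, scaled by the MVE beat
/// factor for MVE vectors, and illegal vector operations are costed as if
/// scalarized.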
InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
    // Make operations on i1 relatively expensive as this often involves
    // combining predicates. AND and XOR should be easier to handle with IT
    // blocks.
    switch (ISDOpcode) {
    default:
      break;
    case ISD::AND:
    case ISD::XOR:
      return 2;
    case ISD::OR:
      return 3;
    }
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
      // Double register types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result is
  // also multiplied by the MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
    // Return the cost of multiple scalar invocations plus the cost of
    // inserting and extracting the values.
    SmallVector<Type *> Tys(Args.size(), Ty);
    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
  }

  return BaseCost;
}
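
/// Load/store costs. Unaligned NEON f64 vector accesses and MVE
/// fpext/fptrunc-folded half-float accesses get target-specific costs; the
/// rest use the base cost, scaled by the MVE beat factor for vector types.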
InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    return LT.first * 4;
  }

  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
  // Same for stores.
  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
      ((Opcode == Instruction::Load && I->hasOneUse() &&
        isa<FPExtInst>(*I->user_begin())) ||
       (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
    FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
    Type *DstTy =
        Opcode == Instruction::Load
            ? (*I->user_begin())->getType()
            : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
    if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
        DstTy->getScalarType()->isFloatTy())
      return ST->getMVEVectorCostFactor(CostKind);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                           CostKind, I);
}
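
/// Masked loads/stores that are legal for MVE cost a single beat-scaled
/// operation; illegal fixed-width vectors get a high per-element cost to
/// reflect the scalarized expansion.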
InstructionCost
ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  if (ST->hasMVEIntegerOps()) {
    if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
      return ST->getMVEVectorCostFactor(CostKind);
    if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
      return ST->getMVEVectorCostFactor(CostKind);
  }
  if (!isa<FixedVectorType>(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
  return cast<FixedVectorType>(Src)->getNumElements() * 8;
}
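
/// Interleaved accesses that can be lowered to vldN/vstN (or, for narrow MVE
/// types, a single load/store plus vrev/vmovn) are costed per instruction
/// generated; anything else uses the base implementation.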
InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN don't support vector types with i64/f64 elements.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost =
        ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() &&
        DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
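
/// Gather/scatter costs for MVE. The vector cost assumes the lanes are
/// effectively serialised, while the scalar cost additionally includes the
/// insert/extract overhead; only 128-bit patterns with supported element,
/// extend/truncate and offset types are given the vector cost.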
InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  using namespace PatternMatch;
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
  InstructionCost VectorCost =
      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  InstructionCost ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
      BaseT::getScalarizationOverhead(VTy, false, true);

  if (EltSize < 8 || Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type.
  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // Only allow valid type combinations.
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated.
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations.
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
      if (((EltSize == 16 && TypeSize == 32) ||
           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
          TypeSize * NumElems == 128)
        ExtSize = TypeSize;
    }
  }

  if (ExtSize * NumElems != 128 || NumElems < 4)
    return ScalarCost;

  // Any (aligned) i32 gather will not need to be scalarised.
  if (ExtSize == 32)
    return VectorCost;
  // For smaller types, we need to ensure that the gep's inputs are correctly
  // extended from a small enough value. Other sizes (including i64) are
  // scalarized for now.
  if (ExtSize != 8 && ExtSize != 16)
    return ScalarCost;

  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
    Ptr = BC->getOperand(0);
  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
    if (GEP->getNumOperands() != 2)
      return ScalarCost;
    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
    // Scale needs to be correct (which is only relevant for i16s).
    if (Scale != 1 && Scale * 8 != ExtSize)
      return ScalarCost;
    // And we need to zext (not sext) the indexes from a small enough type.
    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
        return VectorCost;
    }
    return ScalarCost;
  }
  return ScalarCost;
}
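
/// Unordered integer add reductions of legal 128-bit vectors map onto a single
/// beat-scaled MVE instruction (VADDV); everything else uses the base cost.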
InstructionCost
ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  EVT ValVT = TLI->getValueType(DL, ValTy);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  static const CostTblEntry CostTblAdd[]{
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };
  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
    return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
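
/// Extending add reductions (and their multiply-accumulate forms) that match
/// the MVE VADDV/VADDLV/VMLAV/VMLALV patterns cost a single beat-scaled
/// instruction.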
InstructionCost
ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                        Type *ResTy, VectorType *ValTy,
                                        TTI::TargetCostKind CostKind) {
  EVT ValVT = TLI->getValueType(DL, ValTy);
  EVT ResVT = TLI->getValueType(DL, ResTy);

  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);

    // The legal cases are:
    //   VADDV  u/s 8/16/32
    //   VMLAV  u/s 8/16/32
    //   VADDLV u/s 32
    //   VMLALV u/s 16/32
    // Codegen currently cannot always handle larger than legal vectors very
    // well, especially for predicated reductions where the mask needs to be
    // split, so restrict to 128-bit or smaller input types.
    unsigned RevVTSize = ResVT.getSizeInBits();
    if (ValVT.getSizeInBits() <= 128 &&
        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
         (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
      return ST->getMVEVectorCostFactor(CostKind) * LT.first;
  }

  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
                                            CostKind);
}
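
/// Intrinsic costs: get.active.lane.mask is treated as free under MVE, and
/// saturating add/sub, integer abs/min/max and float min/max map onto one (or
/// a few) beat-scaled MVE instructions for legal 128-bit vector types.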
InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  switch (ICA.getID()) {
  case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_masks are always free. In reality it may be freely folded
    // into a tail predicated loop, expanded into a VCTP or expanded into a lot
    // of add/icmp code. We may need to improve this in the future, but being
    // able to detect if it is free or not involves looking at a lot of other
    // code. We currently assume that the vectorizer inserted these, and knew
    // what it was doing in adding one.
    if (ST->hasMVEIntegerOps())
      return 0;
    break;
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    if (!ST->hasMVEIntegerOps())
      break;
    Type *VT = ICA.getReturnType();

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
        LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
      unsigned Instrs =
          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
      return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
    }
    break;
  }
  case Intrinsic::abs:
  case Intrinsic::smin:
  case Intrinsic::smax:
  case Intrinsic::umin:
  case Intrinsic::umax: {
    if (!ST->hasMVEIntegerOps())
      break;
    Type *VT = ICA.getReturnType();

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
        LT.second == MVT::v16i8)
      return LT.first * ST->getMVEVectorCostFactor(CostKind);
    break;
  }
  case Intrinsic::minnum:
  case Intrinsic::maxnum: {
    if (!ST->hasMVEFloatOps())
      break;
    Type *VT = ICA.getReturnType();
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
      return LT.first * ST->getMVEVectorCostFactor(CostKind);
    break;
  }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
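
/// Return true if a call to \p F is expected to be lowered to a real call,
/// taking the FP/MVE features of the subtarget into account for the intrinsics
/// that can otherwise be expanded inline.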
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);

  // Assume all Arm-specific intrinsics map to an instruction.
  if (F->getName().startswith("llvm.arm"))
    return false;

  switch (F->getIntrinsicID()) {
  default: break;
  case Intrinsic::powi:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::pow:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::exp:
  case Intrinsic::exp2:
    return true;
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::canonicalize:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
      return true;
    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
      return true;
    // Some operations can be handled by vector instructions and assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO: Handle scalar operations properly.
    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  case Intrinsic::masked_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_gather:
  case Intrinsic::masked_scatter:
    return !ST->hasMVEIntegerOps();
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::usub_sat:
    return false;
  }

  return BaseT::isLoweredToCall(F);
}
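
/// Conservatively check whether lowering \p I may introduce a call, e.g. a
/// libcall for 64-bit division, soft-float arithmetic or an unsupported FP
/// conversion, which would clobber LR and so defeat a low-overhead loop.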
bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
  EVT VT = TLI->getValueType(DL, I.getType(), true);
  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
    return true;

  // Check if an intrinsic will be lowered to a call and assume that any
  // other CallInst will generate a bl.
  if (auto *Call = dyn_cast<CallInst>(&I)) {
    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::memcpy:
      case Intrinsic::memset:
      case Intrinsic::memmove:
        return getNumMemOps(II) == -1;
      default:
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
    }
    return true;
  }

  // FPv5 provides conversions between integer, double-precision,
  // single-precision, and half-precision formats.
  switch (I.getOpcode()) {
  default:
    break;
  case Instruction::FPToSI:
  case Instruction::FPToUI:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::FPTrunc:
  case Instruction::FPExt:
    return !ST->hasFPARMv8Base();
  }

  // FIXME: Unfortunately the approach of checking the Operation Action does
  // not catch all cases of Legalization that use library calls. Our
  // Legalization step categorizes some transformations into library calls as
  // Custom, Expand or even Legal when doing type legalization. So for now
  // we have to special case, for instance, the SDIV of 64-bit integers and the
  // use of floating point emulation.
  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
    switch (ISD) {
    default:
      break;
    case ISD::SDIV:
    case ISD::UDIV:
    case ISD::SREM:
    case ISD::UREM:
    case ISD::SDIVREM:
    case ISD::UDIVREM:
      return true;
    }
  }

  // Assume all other non-float operations are supported.
  if (!VT.isFloatingPoint())
    return false;

  // We'll need a library call to handle most floats when using soft floats.
  if (TLI->useSoftFloat()) {
    switch (I.getOpcode()) {
    default:
      return true;
    case Instruction::Alloca:
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::Select:
    case Instruction::PHI:
      return false;
    }
  }

  // We'll need a libcall to perform double precision operations on a single
  // precision only FPU.
  if (I.getType()->isDoubleTy() && !ST->hasFP64())
    return true;

  // Likewise for half precision arithmetic.
  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
    return true;

  return false;
}
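
/// Decide whether this loop is a suitable candidate for a v8.1-M low-overhead
/// hardware loop: the trip count must be computable and fit in 32 bits, and
/// the body must not contain anything that may turn into a call, inline asm,
/// or an existing hardware-loop intrinsic.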
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  // Low-overhead branches are only supported in the 'low-overhead branch'
  // extension of v8.1-m.
  if (!ST->hasLOB() || DisableLowOverheadLoops) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
    return false;
  }

  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
    return false;
  }

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
    return false;
  }

  const SCEV *TripCountSCEV =
      SE.getAddExpr(BackedgeTakenCount,
                    SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
    return false;
  }

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.
  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
      switch (Call->getIntrinsicID()) {
      default:
        break;
      case Intrinsic::start_loop_iterations:
      case Intrinsic::test_start_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
        return true;
      }
    }
    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call or if this loop is already a low-overhead loop or will become a tail
  // predicated loop.
  bool IsTailPredLoop = false;
  auto ScanLoop = [&](Loop *L) {
    for (auto *BB : L->getBlocks()) {
      for (auto &I : *BB) {
        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
            isa<InlineAsm>(I)) {
          LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
          return false;
        }
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          IsTailPredLoop |=
              II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
      }
    }
    return true;
  };

  // Visit inner loops.
  for (auto Inner : *L)
    if (!ScanLoop(Inner))
      return false;

  if (!ScanLoop(L))
    return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.IsNestingLegal = false;
  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}
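
// Return true if \p I does not prevent its loop from being tail-predicated.
// \p ICmpCount counts the compares (and, as a workaround, min/max intrinsics)
// seen so far; only the backedge compare is allowed.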
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow icmp's, and because we only look at single block loops,
  // we simply count the icmps, i.e. there should only be 1 for the backedge.
  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
    return false;
  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
  // are not currently canonical, but soon will be. Code without them uses
  // icmp, and so is not tail predicated as per the condition above. In order
  // to get the same performance we treat min and max the same as an icmp for
  // tailpred purposes for the moment (we often rely on non-tailpred and higher
  // VF's to pick more optimal instructions like VQDMULH. They need to be
  // recognized directly by the vectorizer).
  if (auto *II = dyn_cast<IntrinsicInst>(&I))
    if ((II->getIntrinsicID() == Intrinsic::smin ||
         II->getIntrinsicID() == Intrinsic::smax ||
         II->getIntrinsicID() == Intrinsic::umin ||
         II->getIntrinsicID() == Intrinsic::umax) &&
        ++ICmpCount > 1)
      return false;

  if (isa<FCmpInst>(&I))
    return false;

  // We could allow extending/narrowing FP loads/stores, but codegen is
  // too inefficient so reject this for now.
  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
    return false;

  // Extends have to be extending-loads.
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
      return false;

  // Truncs have to be narrowing-stores.
  if (isa<TruncInst>(&I))
    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
      return false;

  return true;
}

// To set up a tail-predicated loop, we need to know the total number of
// elements processed by that loop. Thus, we need to determine the element
// size and:
// 1) it should be uniform for all operations in the vector loop, so we
//    e.g. don't want any widening/narrowing operations.
// 2) it should be smaller than i64s because we don't have vector operations
//    that work on i64s.
// 3) we don't want elements to be reversed or shuffled, to make sure the
//    tail-predication masks/predicates the right lanes.
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                 const DataLayout &DL,
                                 const LoopAccessInfo *LAI) {
  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");

  // If there are live-out values, it is probably a reduction. We can predicate
  // most reduction operations freely under MVE using a combination of
  // prefer-predicated-reduction-select and inloop reductions. We limit this to
  // floating point and integer reductions, but don't check for operators
  // specifically here. If the value ends up not being a reduction (and so the
  // vectorizer cannot tailfold the loop), we should fall back to standard
  // vectorization automatically.
  SmallVector<Instruction *, 8> LiveOuts;
  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
  bool ReductionsDisabled =
      EnableTailPredication == TailPredication::EnabledNoReductions ||
      EnableTailPredication == TailPredication::ForceEnabledNoReductions;

  for (auto *I : LiveOuts) {
    if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
        !I->getType()->isHalfTy()) {
      LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
                           "live-out value\n");
      return false;
    }
    if (ReductionsDisabled) {
      LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
      return false;
    }
  }

  // Next, check that all instructions can be tail-predicated.
  PredicatedScalarEvolution PSE = LAI->getPSE();
  SmallVector<Instruction *, 16> LoadStores;
  int ICmpCount = 0;

  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      if (isa<PHINode>(&I))
        continue;
      if (!canTailPredicateInstruction(I, ICmpCount)) {
        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
        return false;
      }

      Type *T = I.getType();
      if (T->getScalarSizeInBits() > 32) {
        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
        return false;
      }
      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
        if (NextStride == 1) {
          // TODO: for now only allow consecutive strides of 1. We could
          // support other strides as long as it is uniform, but let's keep it
          // simple for now.
          continue;
        } else if (NextStride == -1 ||
                   (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
                   (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          LLVM_DEBUG(dbgs()
                     << "Consecutive strides of 2 found, vld2/vstr2 can't "
                        "be tail-predicated.\n");
          return false;
          // TODO: don't tail predicate if there is a reversed load?
        } else if (EnableMaskedGatherScatters) {
          // Gather/scatters do allow loading from arbitrary strides, at
          // least if they are loop invariant.
          // TODO: Loop variant strides should in theory work, too, but
          // this requires further testing.
          const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
          if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
            const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
            if (PSE.getSE()->isLoopInvariant(Step, L))
              continue;
          }
        }
        LLVM_DEBUG(dbgs() << "Bad stride found, can't tail-predicate.\n");
        return false;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
  return true;
}
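
/// Prefer tail-folding over an epilogue loop when MVE tail-predication is
/// enabled, the loop is a single-block hardware-loop candidate and all of its
/// instructions can be predicated.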
bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                             ScalarEvolution &SE,
                                             AssumptionCache &AC,
                                             TargetLibraryInfo *TLI,
                                             DominatorTree *DT,
                                             const LoopAccessInfo *LAI) {
  if (!EnableTailPredication) {
    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
    return false;
  }

  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/store instructions:
  if (!ST->hasMVEIntegerOps())
    return false;

  // For now, restrict this to single block loops.
  if (L->getNumBlocks() > 1) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
                         "loop.\n");
    return false;
  }

  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");

  HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(*LI)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "analyzable.\n");
    return false;
  }

  // This checks if we have the low-overhead branch architecture
  // extension, and if we will create a hardware-loop:
  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "profitable.\n");
    return false;
  }

  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "a candidate.\n");
    return false;
  }

  return canTailPredicateLoop(L, LI, SE, DL, LAI);
}

bool ARMTTIImpl::emitGetActiveLaneMask() const {
  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
    return false;

  // Intrinsic @llvm.get.active.lane.mask is supported.
  // It is used in the MVETailPredication pass, which requires the number of
  // elements processed by this vector loop to set up the tail-predicated
  // loop.
  return true;
}
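
/// Unrolling preferences for M-class cores: runtime/partial unrolling of
/// small, call-free, non-vectorized loops, with the default count reduced on
/// Thumb1 when several values are live out of the loop.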
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  // Enable upper bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Only currently enable these preferences for M-Class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Don't unroll vectorised loops. MVE does not benefit from it as much as
      // scalar code.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *, 4> Operands(I.operand_values());
      Cost +=
          getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  // On v6m cores, there are very few registers available. We can easily end up
  // spilling and reloading more registers in an unrolled loop. Look at the
  // number of LCSSA phis as a rough measure of how many registers will need to
  // be live out of the loop, reducing the default unroll count if more than 1
  // value is needed. In the long run, all of this should be being learnt by a
  // machine.
  unsigned UnrollCount = 4;
  if (ST->isThumb1Only()) {
    unsigned ExitingValues = 0;
    SmallVector<BasicBlock *, 4> ExitBlocks;
    L->getExitBlocks(ExitBlocks);
    for (auto *Exit : ExitBlocks) {
      // Count the number of LCSSA phis. Exclude values coming from GEP's as
      // only the last is expected to be needed for address operands.
      unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
        return PH.getNumOperands() != 1 ||
               !isa<GetElementPtrInst>(PH.getOperand(0));
      });
      ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
    }
    if (ExitingValues)
      UnrollCount /= ExitingValues;
    if (UnrollCount <= 1)
      return;
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = UnrollCount;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
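
/// Prefer in-loop reductions for MVE integer adds, which map onto
/// VADDV/VMLAV-style instructions.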
bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                       TTI::ReductionFlags Flags) const {
  if (!ST->hasMVEIntegerOps())
    return false;

  unsigned ScalarBits = Ty->getScalarSizeInBits();
  switch (Opcode) {
  case Instruction::Add:
    return ScalarBits <= 64;
  default:
    return false;
  }
}
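
/// Prefer predicated reduction selects whenever MVE integer instructions are
/// available.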
bool ARMTTIImpl::preferPredicatedReductionSelect(
    unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
  if (!ST->hasMVEIntegerOps())
    return false;
  return true;
}