SelectOptimize.cpp 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045
  1. //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass converts selects to conditional jumps when profitable.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #include "llvm/ADT/SmallVector.h"
  13. #include "llvm/ADT/Statistic.h"
  14. #include "llvm/Analysis/BlockFrequencyInfo.h"
  15. #include "llvm/Analysis/BranchProbabilityInfo.h"
  16. #include "llvm/Analysis/LoopInfo.h"
  17. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  18. #include "llvm/Analysis/ProfileSummaryInfo.h"
  19. #include "llvm/Analysis/TargetTransformInfo.h"
  20. #include "llvm/CodeGen/Passes.h"
  21. #include "llvm/CodeGen/TargetLowering.h"
  22. #include "llvm/CodeGen/TargetPassConfig.h"
  23. #include "llvm/CodeGen/TargetSchedule.h"
  24. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  25. #include "llvm/IR/BasicBlock.h"
  26. #include "llvm/IR/Dominators.h"
  27. #include "llvm/IR/Function.h"
  28. #include "llvm/IR/IRBuilder.h"
  29. #include "llvm/IR/Instruction.h"
  30. #include "llvm/IR/ProfDataUtils.h"
  31. #include "llvm/InitializePasses.h"
  32. #include "llvm/Pass.h"
  33. #include "llvm/Support/ScaledNumber.h"
  34. #include "llvm/Target/TargetMachine.h"
  35. #include "llvm/Transforms/Utils/SizeOpts.h"
  36. #include <algorithm>
  37. #include <memory>
  38. #include <queue>
  39. #include <stack>
  40. #include <string>
  41. using namespace llvm;
#define DEBUG_TYPE "select-optimize"

// Pass statistics (reported with -stats): how many select groups were
// analyzed, converted, or rejected, and for which reason.
STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");

// A select value operand is considered "cold" when its path is taken in fewer
// than this percentage of executions (per branch-weights profile data).
static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

// Cost ceiling, expressed as a multiple of TCC_Expensive, under which the
// dependence slice of a cold operand is still treated as inexpensive.
static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

// Thresholds consumed by the loop-level profitability heuristics (their use
// is in checkLoopHeuristics, outside this view — confirm there).
static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

// Branch misprediction rate assumed when no profile data is available.
static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

// Escape hatch to turn off the inner-loop heuristics entirely.
static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));
namespace {

/// Legacy-pass-manager function pass that converts select instructions to
/// conditional branches where heuristics (profile data, operand cost, and
/// loop-level latency analysis) indicate the branch form is profitable.
class SelectOptimize : public FunctionPass {
  // Target/analysis handles, populated in runOnFunction for the current
  // function before any transformation runs.
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *TSI;
  const TargetLowering *TLI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const LoopInfo *LI;
  DominatorTree *DT;
  // BFI/BPI are computed locally (see runOnFunction) rather than obtained
  // from wrapper passes, hence the owning pointers.
  std::unique_ptr<BlockFrequencyInfo> BFI;
  std::unique_ptr<BranchProbabilityInfo> BPI;
  ProfileSummaryInfo *PSI;
  OptimizationRemarkEmitter *ORE;
  TargetSchedModel TSchedModel;

public:
  static char ID;

  SelectOptimize() : FunctionPass(ID) {
    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }

private:
  // Select groups consist of consecutive select instructions with the same
  // condition.
  using SelectGroup = SmallVector<SelectInst *, 2>;
  using SelectGroups = SmallVector<SelectGroup, 2>;

  // Soft-float-style scaled number used for all latency cost bookkeeping.
  using Scaled64 = ScaledNumber<uint64_t>;

  struct CostInfo {
    /// Predicated cost (with selects as conditional moves).
    Scaled64 PredCost;
    /// Non-predicated cost (with selects converted to branches).
    Scaled64 NonPredCost;
  };

  // Converts select instructions of a function to conditional jumps when deemed
  // profitable. Returns true if at least one select was converted.
  bool optimizeSelects(Function &F);

  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of code regions (base heuristics for non-inner-most loop
  // regions).
  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);

  // Converts to branches the select groups that were deemed
  // profitable-to-convert.
  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);

  // Splits selects of a given basic block into select groups.
  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);

  // Determines for which select groups it is profitable converting to branches
  // (base and inner-most-loop heuristics).
  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                  SelectGroups &ProfSIGroups);
  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
                                        SelectGroups &ProfSIGroups);

  // Determines if a select group should be converted to a branch (base
  // heuristics).
  bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);

  // Returns true if there are expensive instructions in the cold value
  // operand's (if any) dependence slice of any of the selects of the given
  // group.
  bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);

  // For a given source instruction, collect its backwards dependence slice
  // consisting of instructions exclusively computed for producing the operands
  // of the source instruction.
  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
                             Instruction *SI, bool ForSinking = false);

  // Returns true if the condition of the select is highly predictable.
  bool isSelectHighlyPredictable(const SelectInst *SI);

  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the given loop is more profitable than its predicated version.
  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);

  // Computes instruction and loop-critical-path costs for both the predicated
  // and non-predicated version of the given loop.
  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
                        CostInfo *LoopCost);

  // Returns a set of all the select instructions in the given select groups.
  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);

  // Returns the latency cost of a given instruction.
  std::optional<uint64_t> computeInstCost(const Instruction *I);

  // Returns the misprediction cost of a given select when converted to branch.
  Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);

  // Returns the cost of a branch when the prediction is correct.
  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                const SelectInst *SI);

  // Returns true if the target architecture supports lowering a given select.
  bool isSelectKindSupported(SelectInst *SI);
};

} // namespace
char SelectOptimize::ID = 0;

// Register the pass and declare the analyses it depends on with the legacy
// pass registry.
INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)

// Factory used by the legacy pass manager pipeline setup.
FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
// Entry point: wire up target info and analyses, apply cheap early-exit
// checks, then run the select-to-branch optimization. Returns true iff the
// IR was modified.
bool SelectOptimize::runOnFunction(Function &F) {
  TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types is supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return false;

  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Targets opt into this transform; skip it where it is not enabled.
  if (!TTI->enableSelectOptimize())
    return false;

  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  // BPI/BFI are computed fresh here (not via wrapper passes), so this pass
  // owns them through the unique_ptr members.
  BPI.reset(new BranchProbabilityInfo(F, *LI));
  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  TSchedModel.init(TSI);

  // When optimizing for size, selects are preferable over branches.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
    return false;

  return optimizeSelects(F);
}
// Driver: gather profitable select groups with both heuristic passes, then
// rewrite them. Returns true iff at least one group was converted.
bool SelectOptimize::optimizeSelects(Function &F) {
  // Determine for which select groups it is profitable converting to branches.
  SelectGroups ProfSIGroups;
  // Base heuristics apply only to non-loops and outer loops.
  optimizeSelectsBase(F, ProfSIGroups);
  // Separate heuristics for inner-most loops.
  optimizeSelectsInnerLoops(F, ProfSIGroups);

  // Convert to branches the select groups that were deemed
  // profitable-to-convert.
  convertProfitableSIGroups(ProfSIGroups);

  // Code modified if at least one select group was converted.
  return !ProfSIGroups.empty();
}
  230. void SelectOptimize::optimizeSelectsBase(Function &F,
  231. SelectGroups &ProfSIGroups) {
  232. // Collect all the select groups.
  233. SelectGroups SIGroups;
  234. for (BasicBlock &BB : F) {
  235. // Base heuristics apply only to non-loops and outer loops.
  236. Loop *L = LI->getLoopFor(&BB);
  237. if (L && L->isInnermost())
  238. continue;
  239. collectSelectGroups(BB, SIGroups);
  240. }
  241. // Determine for which select groups it is profitable converting to branches.
  242. findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
  243. }
  244. void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
  245. SelectGroups &ProfSIGroups) {
  246. SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
  247. // Need to check size on each iteration as we accumulate child loops.
  248. for (unsigned long i = 0; i < Loops.size(); ++i)
  249. for (Loop *ChildL : Loops[i]->getSubLoops())
  250. Loops.push_back(ChildL);
  251. for (Loop *L : Loops) {
  252. if (!L->isInnermost())
  253. continue;
  254. SelectGroups SIGroups;
  255. for (BasicBlock *BB : L->getBlocks())
  256. collectSelectGroups(*BB, SIGroups);
  257. findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
  258. }
  259. }
  260. /// If \p isTrue is true, return the true value of \p SI, otherwise return
  261. /// false value of \p SI. If the true/false value of \p SI is defined by any
  262. /// select instructions in \p Selects, look through the defining select
  263. /// instruction until the true/false value is not defined in \p Selects.
  264. static Value *
  265. getTrueOrFalseValue(SelectInst *SI, bool isTrue,
  266. const SmallPtrSet<const Instruction *, 2> &Selects) {
  267. Value *V = nullptr;
  268. for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
  269. DefSI = dyn_cast<SelectInst>(V)) {
  270. assert(DefSI->getCondition() == SI->getCondition() &&
  271. "The condition of DefSI does not match with SI");
  272. V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
  273. }
  274. assert(V && "Failed to get select true/false value");
  275. return V;
  276. }
// Rewrite each profitable select group into an explicit conditional branch
// plus PHI nodes, sinking the cold dependence slices of the select operands
// into the newly created true/false blocks.
void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : ProfSIGroups) {
    // The code transformation here is a modified version of the sinking
    // transformation in CodeGenPrepare::optimizeSelectInst with a more
    // aggressive strategy of which instructions to sink.
    //
    // TODO: eliminate the redundancy of logic transforming selects to branches
    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
    // selects for all cases (with and without profile information).

    // Transform a sequence like this:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %sel = select i1 %cmp, i32 %c, i32 %d
    //
    // Into:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %cmp.frozen = freeze %cmp
    //       br i1 %cmp.frozen, label %select.true, label %select.false
    //    select.true:
    //       br label %select.end
    //    select.false:
    //       br label %select.end
    //    select.end:
    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
    //
    // %cmp should be frozen, otherwise it may introduce undefined behavior.
    // In addition, we may sink instructions that produce %c or %d into the
    // destination(s) of the new branch.
    // If the true or false blocks do not contain a sunken instruction, that
    // block and its branch may be optimized away. In that case, one side of the
    // first branch will point directly to select.end, and the corresponding PHI
    // predecessor block will be the start block.

    // Find all the instructions that can be soundly sunk to the true/false
    // blocks. These are instructions that are computed solely for producing the
    // operands of the select instructions in the group and can be sunk without
    // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
    // with side effects).
    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
    typedef std::stack<Instruction *>::size_type StackSizeType;
    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
    for (SelectInst *SI : ASI) {
      // For each select, compute the sinkable dependence chains of the true and
      // false operands.
      if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
        std::stack<Instruction *> TrueSlice;
        getExclBackwardsSlice(TI, TrueSlice, SI, true);
        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
        TrueSlices.push_back(TrueSlice);
      }
      if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
        std::stack<Instruction *> FalseSlice;
        getExclBackwardsSlice(FI, FalseSlice, SI, true);
        maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
        FalseSlices.push_back(FalseSlice);
      }
    }
    // In the case of multiple select instructions in the same group, the order
    // of non-dependent instructions (instructions of different dependence
    // slices) in the true/false blocks appears to affect performance.
    // Interleaving the slices seems to experimentally be the optimal approach.
    // This interleaving scheduling allows for more ILP (with a natural downside
    // of increasing a bit register pressure) compared to a simple ordering of
    // one whole chain after another. One would expect that this ordering would
    // not matter since the scheduling in the backend of the compiler would
    // take care of it, but apparently the scheduler fails to deliver optimal
    // ILP with a naive ordering here.
    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
      for (auto &S : TrueSlices) {
        if (!S.empty()) {
          TrueSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }
    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
      for (auto &S : FalseSlices) {
        if (!S.empty()) {
          FalseSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }

    // We split the block containing the select(s) into two blocks.
    SelectInst *SI = ASI.front();
    SelectInst *LastSI = ASI.back();
    BasicBlock *StartBlock = SI->getParent();
    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
    // Keep the block-frequency info consistent: the end block executes as
    // often as the start block did.
    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
    // Delete the unconditional branch that was just created by the split.
    StartBlock->getTerminator()->eraseFromParent();

    // Move any debug/pseudo instructions that were in-between the select
    // group to the newly-created end block.
    SmallVector<Instruction *, 2> DebugPseudoINS;
    auto DIt = SI->getIterator();
    while (&*DIt != LastSI) {
      if (DIt->isDebugOrPseudoInst())
        DebugPseudoINS.push_back(&*DIt);
      DIt++;
    }
    for (auto *DI : DebugPseudoINS) {
      DI->moveBefore(&*EndBlock->getFirstInsertionPt());
    }

    // These are the new basic blocks for the conditional branch.
    // At least one will become an actual new basic block.
    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
    if (!TrueSlicesInterleaved.empty()) {
      TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
                                     EndBlock->getParent(), EndBlock);
      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
      TrueBranch->setDebugLoc(LastSI->getDebugLoc());
      for (Instruction *TrueInst : TrueSlicesInterleaved)
        TrueInst->moveBefore(TrueBranch);
    }
    if (!FalseSlicesInterleaved.empty()) {
      FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
                                      EndBlock->getParent(), EndBlock);
      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(LastSI->getDebugLoc());
      for (Instruction *FalseInst : FalseSlicesInterleaved)
        FalseInst->moveBefore(FalseBranch);
    }
    // If there was nothing to sink, then arbitrarily choose the 'false' side
    // for a new input value to the PHI.
    if (TrueBlock == FalseBlock) {
      assert(TrueBlock == nullptr &&
             "Unexpected basic block transform while optimizing select");
      FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
                                      EndBlock->getParent(), EndBlock);
      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(SI->getDebugLoc());
    }

    // Insert the real conditional branch based on the original condition.
    // If we did not create a new block for one of the 'true' or 'false' paths
    // of the condition, it means that side of the branch goes to the end block
    // directly and the path originates from the start block from the point of
    // view of the new PHI.
    BasicBlock *TT, *FT;
    if (TrueBlock == nullptr) {
      TT = EndBlock;
      FT = FalseBlock;
      TrueBlock = StartBlock;
    } else if (FalseBlock == nullptr) {
      TT = TrueBlock;
      FT = EndBlock;
      FalseBlock = StartBlock;
    } else {
      TT = TrueBlock;
      FT = FalseBlock;
    }
    IRBuilder<> IB(SI);
    auto *CondFr =
        IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
    // Passing SI as the last argument presumably copies its metadata (e.g.
    // branch weights) onto the new branch — confirm against the IRBuilder
    // CreateCondBr MDFrom parameter.
    IB.CreateCondBr(CondFr, TT, FT, SI);

    SmallPtrSet<const Instruction *, 2> INS;
    INS.insert(ASI.begin(), ASI.end());
    // Use reverse iterator because later select may use the value of the
    // earlier select, and we need to propagate value through earlier select
    // to get the PHI operand.
    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
      SelectInst *SI = *It;
      // The select itself is replaced with a PHI Node.
      PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
      PN->takeName(SI);
      PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
      PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
      PN->setDebugLoc(SI->getDebugLoc());
      SI->replaceAllUsesWith(PN);
      SI->eraseFromParent();
      INS.erase(SI);
      ++NumSelectsConverted;
    }
  }
}
  454. static bool isSpecialSelect(SelectInst *SI) {
  455. using namespace llvm::PatternMatch;
  456. // If the select is a logical-and/logical-or then it is better treated as a
  457. // and/or by the backend.
  458. if (match(SI, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
  459. m_LogicalOr(m_Value(), m_Value()))))
  460. return true;
  461. return false;
  462. }
// Scan \p BB and partition its select instructions into groups of consecutive
// selects sharing the same condition (debug/pseudo instructions in-between do
// not break a group). Supported, non-special groups are appended to
// \p SIGroups.
void SelectOptimize::collectSelectGroups(BasicBlock &BB,
                                         SelectGroups &SIGroups) {
  BasicBlock::iterator BBIt = BB.begin();
  while (BBIt != BB.end()) {
    // Note: BBIt is advanced here, so the inner loop below continues from the
    // instruction after the group leader.
    Instruction *I = &*BBIt++;
    if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
      // Logical and/or selects are left to the backend.
      if (isSpecialSelect(SI))
        continue;

      SelectGroup SIGroup;
      SIGroup.push_back(SI);
      // Greedily extend the group with following selects that share the
      // leader's condition.
      while (BBIt != BB.end()) {
        Instruction *NI = &*BBIt;
        SelectInst *NSI = dyn_cast<SelectInst>(NI);
        if (NSI && SI->getCondition() == NSI->getCondition()) {
          SIGroup.push_back(NSI);
        } else if (!NI->isDebugOrPseudoInst()) {
          // Debug/pseudo instructions should be skipped and not prevent the
          // formation of a select group.
          break;
        }
        ++BBIt;
      }

      // If the select type is not supported, no point optimizing it.
      // Instruction selection will take care of it.
      if (!isSelectKindSupported(SI))
        continue;

      SIGroups.push_back(SIGroup);
    }
  }
}
  493. void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
  494. SelectGroups &ProfSIGroups) {
  495. for (SelectGroup &ASI : SIGroups) {
  496. ++NumSelectOptAnalyzed;
  497. if (isConvertToBranchProfitableBase(ASI))
  498. ProfSIGroups.push_back(ASI);
  499. }
  500. }
// Echo the remark's message to the debug stream (visible with -debug), then
// emit it through the optimization remark emitter.
static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
                               DiagnosticInfoOptimizationBase &Rem) {
  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
  ORE->emit(Rem);
}
// Loop-level profitability: accept a loop's select groups only if the whole
// loop passes the critical-path heuristics, and then only those groups whose
// branch-form latency beats their predicated-form latency.
void SelectOptimize::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is more preferable than a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics including reducing the loop's critical path by
  //    some threshold (see SelectOptimize::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //    to its predicated version. The cost is in terms of latency and the cost
  //    of a select group is the cost of its most expensive select instruction
  //    (assuming infinite resources and thus fully leveraging available ILP).
  DenseMap<const Instruction *, CostInfo> InstCostMap;
  // LoopCost[0]/[1] hold costs for two loop-depth levels, zero-initialized.
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectInst *SI : ASI) {
      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, ORmiss);
    }
  }
}
// Base heuristic cascade for one select group (checked against the group's
// first select): reject cold blocks and unpredictable selects; accept highly
// predictable selects (on targets where predictable cmovs are expensive) and
// selects with an expensive cold-operand dependence slice.
bool SelectOptimize::isConvertToBranchProfitableBase(
    const SmallVector<SelectInst *, 2> &ASI) {
  SelectInst *SI = ASI.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, ORmiss);
  return false;
}
  590. static InstructionCost divideNearest(InstructionCost Numerator,
  591. uint64_t Denominator) {
  592. return (Numerator + (Denominator / 2)) / Denominator;
  593. }
// Returns true if any select in the group \p ASI has a cold value operand
// whose exclusive backwards dependence slice is expensive enough (scaled by
// how cold the operand is) to make branch conversion profitable.
bool SelectOptimize::hasExpensiveColdOperand(
    const SmallVector<SelectInst *, 2> &ASI) {
  bool ColdOperand = false;
  // NOTE: TotalWeight is only initialized when branch weights are available;
  // it is read below only after ColdOperand is true, which implies the same
  // condition held, so no uninitialized read occurs.
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    // Profile data exists but this select has no branch weights; emit a
    // missed-optimization remark so the gap is diagnosable.
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  for (SelectInst *SI : ASI) {
    // The cold operand is the value on the less frequently taken path; the
    // hot weight is used below to scale the slice cost by relative coldness.
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast<Instruction>(SI->getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast<Instruction>(SI->getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      // Sum the latency cost of the cold operand's one-use dependence slice.
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(ColdI, ColdSlice, SI);
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(ColdSlice.top(),
                                             TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is the more expensive
      // the cmov becomes for computing the cold value operand every time. Thus,
      // the colder the cold operand is the more its cost counts.
      // Get nearest integer cost adjusted for coldness.
      InstructionCost AdjSliceCost =
          divideNearest(SliceCost * HotWeight, TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}
  645. // Check if it is safe to move LoadI next to the SI.
  646. // Conservatively assume it is safe only if there is no instruction
  647. // modifying memory in-between the load and the select instruction.
  648. static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
  649. // Assume loads from different basic blocks are unsafe to move.
  650. if (LoadI->getParent() != SI->getParent())
  651. return false;
  652. auto It = LoadI->getIterator();
  653. while (&*It != SI) {
  654. if (It->mayWriteToMemory())
  655. return false;
  656. It++;
  657. }
  658. return true;
  659. }
  660. // For a given source instruction, collect its backwards dependence slice
  661. // consisting of instructions exclusively computed for the purpose of producing
  662. // the operands of the source instruction. As an approximation
  663. // (sufficiently-accurate in practice), we populate this set with the
  664. // instructions of the backwards dependence slice that only have one-use and
  665. // form an one-use chain that leads to the source instruction.
  666. void SelectOptimize::getExclBackwardsSlice(Instruction *I,
  667. std::stack<Instruction *> &Slice,
  668. Instruction *SI, bool ForSinking) {
  669. SmallPtrSet<Instruction *, 2> Visited;
  670. std::queue<Instruction *> Worklist;
  671. Worklist.push(I);
  672. while (!Worklist.empty()) {
  673. Instruction *II = Worklist.front();
  674. Worklist.pop();
  675. // Avoid cycles.
  676. if (!Visited.insert(II).second)
  677. continue;
  678. if (!II->hasOneUse())
  679. continue;
  680. // Cannot soundly sink instructions with side-effects.
  681. // Terminator or phi instructions cannot be sunk.
  682. // Avoid sinking other select instructions (should be handled separetely).
  683. if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
  684. isa<SelectInst>(II) || isa<PHINode>(II)))
  685. continue;
  686. // Avoid sinking loads in order not to skip state-modifying instructions,
  687. // that may alias with the loaded address.
  688. // Only allow sinking of loads within the same basic block that are
  689. // conservatively proven to be safe.
  690. if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
  691. continue;
  692. // Avoid considering instructions with less frequency than the source
  693. // instruction (i.e., avoid colder code regions of the dependence slice).
  694. if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
  695. continue;
  696. // Eligible one-use instruction added to the dependence slice.
  697. Slice.push(II);
  698. // Explore all the operands of the current instruction to expand the slice.
  699. for (unsigned k = 0; k < II->getNumOperands(); ++k)
  700. if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
  701. Worklist.push(OpI);
  702. }
  703. }
  704. bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
  705. uint64_t TrueWeight, FalseWeight;
  706. if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
  707. uint64_t Max = std::max(TrueWeight, FalseWeight);
  708. uint64_t Sum = TrueWeight + FalseWeight;
  709. if (Sum != 0) {
  710. auto Probability = BranchProbability::getBranchProbability(Max, Sum);
  711. if (Probability > TTI->getPredictableBranchThreshold())
  712. return true;
  713. }
  714. }
  715. return false;
  716. }
// Decides whether converting the selects of loop \p L to branches is
// profitable at the loop level, given the predicated / non-predicated
// critical-path costs of two analyzed iterations in \p LoopCost.
bool SelectOptimize::checkLoopHeuristics(const Loop *L,
                                         const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   L->getHeader()->getFirstNonPHI());

  // Require that branching does not lengthen the first iteration's critical
  // path and strictly shortens the second iteration's.
  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Gain[i]: critical-path cycles saved by branching in analyzed iteration i.
  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches need to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
  if (Gain[1] > Gain[0]) {
    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}
// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
bool SelectOptimize::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  // Flat set of every select in the groups, for fast membership tests below.
  const auto &SIset = getSIset(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow to fully exploit the available
        // instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        // Operand costs come from InstCostMap, i.e. instructions already
        // visited (possibly in the previous analyzed iteration).
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(U.get());
          if (!UI)
            continue;
          if (InstCostMap.count(UI)) {
            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
          }
        }
        auto ILatency = computeInstCost(&I);
        if (!ILatency) {
          // Abort the whole loop analysis on any invalid instruction cost.
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(*ILatency);
        INonPredCost += Scaled64::get(*ILatency);

        // For a select that can be converted to branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
        if (SIset.contains(&I)) {
          auto SI = cast<SelectInst>(&I);

          // Non-predicated costs of the two value operands, if computed.
          Scaled64 TrueOpCost = Scaled64::getZero(),
                   FalseOpCost = Scaled64::getZero();
          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
            if (InstCostMap.count(TI))
              TrueOpCost = InstCostMap[TI].NonPredCost;
          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
            if (InstCostMap.count(FI))
              FalseOpCost = InstCostMap[FI].NonPredCost;
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);

          // The condition's cost contributes to how late a misprediction is
          // detected (see getMispredictionCost).
          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
            if (InstCostMap.count(CI))
              CondCost = InstCostMap[CI].NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        // Record this instruction's costs and update the critical-path maxima.
        InstCostMap[&I] = {IPredCost, INonPredCost};
        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}
  858. SmallPtrSet<const Instruction *, 2>
  859. SelectOptimize::getSIset(const SelectGroups &SIGroups) {
  860. SmallPtrSet<const Instruction *, 2> SIset;
  861. for (const SelectGroup &ASI : SIGroups)
  862. for (const SelectInst *SI : ASI)
  863. SIset.insert(SI);
  864. return SIset;
  865. }
  866. std::optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
  867. InstructionCost ICost =
  868. TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  869. if (auto OC = ICost.getValue())
  870. return std::optional<uint64_t>(*OC);
  871. return std::nullopt;
  872. }
  873. ScaledNumber<uint64_t>
  874. SelectOptimize::getMispredictionCost(const SelectInst *SI,
  875. const Scaled64 CondCost) {
  876. uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  877. // Account for the default misprediction rate when using a branch
  878. // (conservatively set to 25% by default).
  879. uint64_t MispredictRate = MispredictDefaultRate;
  880. // If the select condition is obviously predictable, then the misprediction
  881. // rate is zero.
  882. if (isSelectHighlyPredictable(SI))
  883. MispredictRate = 0;
  884. // CondCost is included to account for cases where the computation of the
  885. // condition is part of a long dependence chain (potentially loop-carried)
  886. // that would delay detection of a misprediction and increase its cost.
  887. Scaled64 MispredictCost =
  888. std::max(Scaled64::get(MispredictPenalty), CondCost) *
  889. Scaled64::get(MispredictRate);
  890. MispredictCost /= Scaled64::get(100);
  891. return MispredictCost;
  892. }
  893. // Returns the cost of a branch when the prediction is correct.
  894. // TrueCost * TrueProbability + FalseCost * FalseProbability.
  895. ScaledNumber<uint64_t>
  896. SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
  897. const SelectInst *SI) {
  898. Scaled64 PredPathCost;
  899. uint64_t TrueWeight, FalseWeight;
  900. if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
  901. uint64_t SumWeight = TrueWeight + FalseWeight;
  902. if (SumWeight != 0) {
  903. PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
  904. FalseCost * Scaled64::get(FalseWeight);
  905. PredPathCost /= Scaled64::get(SumWeight);
  906. return PredPathCost;
  907. }
  908. }
  909. // Without branch weight metadata, we assume 75% for the one path and 25% for
  910. // the other, and pick the result with the biggest cost.
  911. PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
  912. FalseCost * Scaled64::get(3) + TrueCost);
  913. PredPathCost /= Scaled64::get(4);
  914. return PredPathCost;
  915. }
  916. bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
  917. bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
  918. if (VectorCond)
  919. return false;
  920. TargetLowering::SelectSupportKind SelectKind;
  921. if (SI->getType()->isVectorTy())
  922. SelectKind = TargetLowering::ScalarCondVectorVal;
  923. else
  924. SelectKind = TargetLowering::ScalarValSelect;
  925. return TLI->isSelectSupported(SelectKind);
  926. }