BypassSlowDivision.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains an optimization for div and rem on architectures that
  10. // execute short instructions significantly faster than longer instructions.
  11. // For example, on Intel Atom 32-bit divides are slow enough that during
  12. // runtime it is profitable to check the value of the operands, and if they are
  13. // positive and less than 256 use an unsigned 8-bit divide.
  14. //
  15. //===----------------------------------------------------------------------===//
  16. #include "llvm/Transforms/Utils/BypassSlowDivision.h"
  17. #include "llvm/ADT/DenseMap.h"
  18. #include "llvm/ADT/None.h"
  19. #include "llvm/ADT/Optional.h"
  20. #include "llvm/ADT/STLExtras.h"
  21. #include "llvm/ADT/SmallPtrSet.h"
  22. #include "llvm/Transforms/Utils/Local.h"
  23. #include "llvm/Analysis/ValueTracking.h"
  24. #include "llvm/IR/BasicBlock.h"
  25. #include "llvm/IR/Constants.h"
  26. #include "llvm/IR/DerivedTypes.h"
  27. #include "llvm/IR/Function.h"
  28. #include "llvm/IR/IRBuilder.h"
  29. #include "llvm/IR/Instruction.h"
  30. #include "llvm/IR/Instructions.h"
  31. #include "llvm/IR/Module.h"
  32. #include "llvm/IR/Type.h"
  33. #include "llvm/IR/Value.h"
  34. #include "llvm/Support/Casting.h"
  35. #include "llvm/Support/KnownBits.h"
  36. #include <cassert>
  37. #include <cstdint>
  38. using namespace llvm;
  39. #define DEBUG_TYPE "bypass-slow-division"
  40. namespace {
  41. struct QuotRemPair {
  42. Value *Quotient;
  43. Value *Remainder;
  44. QuotRemPair(Value *InQuotient, Value *InRemainder)
  45. : Quotient(InQuotient), Remainder(InRemainder) {}
  46. };
  47. /// A quotient and remainder, plus a BB from which they logically "originate".
  48. /// If you use Quotient or Remainder in a Phi node, you should use BB as its
  49. /// corresponding predecessor.
  50. struct QuotRemWithBB {
  51. BasicBlock *BB = nullptr;
  52. Value *Quotient = nullptr;
  53. Value *Remainder = nullptr;
  54. };
  55. using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
  56. using BypassWidthsTy = DenseMap<unsigned, unsigned>;
  57. using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
  58. enum ValueRange {
  59. /// Operand definitely fits into BypassType. No runtime checks are needed.
  60. VALRNG_KNOWN_SHORT,
  61. /// A runtime check is required, as value range is unknown.
  62. VALRNG_UNKNOWN,
  63. /// Operand is unlikely to fit into BypassType. The bypassing should be
  64. /// disabled.
  65. VALRNG_LIKELY_LONG
  66. };
  67. class FastDivInsertionTask {
  68. bool IsValidTask = false;
  69. Instruction *SlowDivOrRem = nullptr;
  70. IntegerType *BypassType = nullptr;
  71. BasicBlock *MainBB = nullptr;
  72. bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  73. ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  74. QuotRemWithBB createSlowBB(BasicBlock *Successor);
  75. QuotRemWithBB createFastBB(BasicBlock *Successor);
  76. QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
  77. BasicBlock *PhiBB);
  78. Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  79. Optional<QuotRemPair> insertFastDivAndRem();
  80. bool isSignedOp() {
  81. return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
  82. SlowDivOrRem->getOpcode() == Instruction::SRem;
  83. }
  84. bool isDivisionOp() {
  85. return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
  86. SlowDivOrRem->getOpcode() == Instruction::UDiv;
  87. }
  88. Type *getSlowType() { return SlowDivOrRem->getType(); }
  89. public:
  90. FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
  91. Value *getReplacement(DivCacheTy &Cache);
  92. };
  93. } // end anonymous namespace
  94. FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
  95. const BypassWidthsTy &BypassWidths) {
  96. switch (I->getOpcode()) {
  97. case Instruction::UDiv:
  98. case Instruction::SDiv:
  99. case Instruction::URem:
  100. case Instruction::SRem:
  101. SlowDivOrRem = I;
  102. break;
  103. default:
  104. // I is not a div/rem operation.
  105. return;
  106. }
  107. // Skip division on vector types. Only optimize integer instructions.
  108. IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  109. if (!SlowType)
  110. return;
  111. // Skip if this bitwidth is not bypassed.
  112. auto BI = BypassWidths.find(SlowType->getBitWidth());
  113. if (BI == BypassWidths.end())
  114. return;
  115. // Get type for div/rem instruction with bypass bitwidth.
  116. IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  117. BypassType = BT;
  118. // The original basic block.
  119. MainBB = I->getParent();
  120. // The instruction is indeed a slow div or rem operation.
  121. IsValidTask = true;
  122. }
  123. /// Reuses previously-computed dividend or remainder from the current BB if
  124. /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
  125. /// perform the optimization and caches the resulting dividend and remainder.
  126. /// If no replacement can be generated, nullptr is returned.
  127. Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  128. // First, make sure that the task is valid.
  129. if (!IsValidTask)
  130. return nullptr;
  131. // Then, look for a value in Cache.
  132. Value *Dividend = SlowDivOrRem->getOperand(0);
  133. Value *Divisor = SlowDivOrRem->getOperand(1);
  134. DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
  135. auto CacheI = Cache.find(Key);
  136. if (CacheI == Cache.end()) {
  137. // If previous instance does not exist, try to insert fast div.
  138. Optional<QuotRemPair> OptResult = insertFastDivAndRem();
  139. // Bail out if insertFastDivAndRem has failed.
  140. if (!OptResult)
  141. return nullptr;
  142. CacheI = Cache.insert({Key, *OptResult}).first;
  143. }
  144. QuotRemPair &Value = CacheI->second;
  145. return isDivisionOp() ? Value.Quotient : Value.Remainder;
  146. }
  147. /// Check if a value looks like a hash.
  148. ///
  149. /// The routine is expected to detect values computed using the most common hash
  150. /// algorithms. Typically, hash computations end with one of the following
  151. /// instructions:
  152. ///
  153. /// 1) MUL with a constant wider than BypassType
  154. /// 2) XOR instruction
  155. ///
  156. /// And even if we are wrong and the value is not a hash, it is still quite
  157. /// unlikely that such values will fit into BypassType.
  158. ///
  159. /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
  160. /// It is implemented as a depth-first search for values that look neither long
  161. /// nor hash-like.
  162. bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
  163. Instruction *I = dyn_cast<Instruction>(V);
  164. if (!I)
  165. return false;
  166. switch (I->getOpcode()) {
  167. case Instruction::Xor:
  168. return true;
  169. case Instruction::Mul: {
  170. // After Constant Hoisting pass, long constants may be represented as
  171. // bitcast instructions. As a result, some constants may look like an
  172. // instruction at first, and an additional check is necessary to find out if
  173. // an operand is actually a constant.
  174. Value *Op1 = I->getOperand(1);
  175. ConstantInt *C = dyn_cast<ConstantInt>(Op1);
  176. if (!C && isa<BitCastInst>(Op1))
  177. C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
  178. return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
  179. }
  180. case Instruction::PHI:
  181. // Stop IR traversal in case of a crazy input code. This limits recursion
  182. // depth.
  183. if (Visited.size() >= 16)
  184. return false;
  185. // Do not visit nodes that have been visited already. We return true because
  186. // it means that we couldn't find any value that doesn't look hash-like.
  187. if (!Visited.insert(I).second)
  188. return true;
  189. return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
  190. // Ignore undef values as they probably don't affect the division
  191. // operands.
  192. return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
  193. isa<UndefValue>(V);
  194. });
  195. default:
  196. return false;
  197. }
  198. }
  199. /// Check if an integer value fits into our bypass type.
  200. ValueRange FastDivInsertionTask::getValueRange(Value *V,
  201. VisitedSetTy &Visited) {
  202. unsigned ShortLen = BypassType->getBitWidth();
  203. unsigned LongLen = V->getType()->getIntegerBitWidth();
  204. assert(LongLen > ShortLen && "Value type must be wider than BypassType");
  205. unsigned HiBits = LongLen - ShortLen;
  206. const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
  207. KnownBits Known(LongLen);
  208. computeKnownBits(V, Known, DL);
  209. if (Known.countMinLeadingZeros() >= HiBits)
  210. return VALRNG_KNOWN_SHORT;
  211. if (Known.countMaxLeadingZeros() < HiBits)
  212. return VALRNG_LIKELY_LONG;
  213. // Long integer divisions are often used in hashtable implementations. It's
  214. // not worth bypassing such divisions because hash values are extremely
  215. // unlikely to have enough leading zeros. The call below tries to detect
  216. // values that are unlikely to fit BypassType (including hashes).
  217. if (isHashLikeValue(V, Visited))
  218. return VALRNG_LIKELY_LONG;
  219. return VALRNG_UNKNOWN;
  220. }
  221. /// Add new basic block for slow div and rem operations and put it before
  222. /// SuccessorBB.
  223. QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
  224. QuotRemWithBB DivRemPair;
  225. DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
  226. MainBB->getParent(), SuccessorBB);
  227. IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  228. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  229. Value *Dividend = SlowDivOrRem->getOperand(0);
  230. Value *Divisor = SlowDivOrRem->getOperand(1);
  231. if (isSignedOp()) {
  232. DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
  233. DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
  234. } else {
  235. DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
  236. DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
  237. }
  238. Builder.CreateBr(SuccessorBB);
  239. return DivRemPair;
  240. }
  241. /// Add new basic block for fast div and rem operations and put it before
  242. /// SuccessorBB.
  243. QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
  244. QuotRemWithBB DivRemPair;
  245. DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
  246. MainBB->getParent(), SuccessorBB);
  247. IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  248. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  249. Value *Dividend = SlowDivOrRem->getOperand(0);
  250. Value *Divisor = SlowDivOrRem->getOperand(1);
  251. Value *ShortDivisorV =
  252. Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
  253. Value *ShortDividendV =
  254. Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
  255. // udiv/urem because this optimization only handles positive numbers.
  256. Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
  257. Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
  258. DivRemPair.Quotient =
  259. Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
  260. DivRemPair.Remainder =
  261. Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
  262. Builder.CreateBr(SuccessorBB);
  263. return DivRemPair;
  264. }
  265. /// Creates Phi nodes for result of Div and Rem.
  266. QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
  267. QuotRemWithBB &RHS,
  268. BasicBlock *PhiBB) {
  269. IRBuilder<> Builder(PhiBB, PhiBB->begin());
  270. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  271. PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  272. QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  273. QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  274. PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  275. RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  276. RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  277. return QuotRemPair(QuoPhi, RemPhi);
  278. }
  279. /// Creates a runtime check to test whether both the divisor and dividend fit
  280. /// into BypassType. The check is inserted at the end of MainBB. True return
  281. /// value means that the operands fit. Either of the operands may be NULL if it
  282. /// doesn't need a runtime check.
  283. Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  284. assert((Op1 || Op2) && "Nothing to check");
  285. IRBuilder<> Builder(MainBB, MainBB->end());
  286. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  287. Value *OrV;
  288. if (Op1 && Op2)
  289. OrV = Builder.CreateOr(Op1, Op2);
  290. else
  291. OrV = Op1 ? Op1 : Op2;
  292. // BitMask is inverted to check if the operands are
  293. // larger than the bypass type
  294. uint64_t BitMask = ~BypassType->getBitMask();
  295. Value *AndV = Builder.CreateAnd(OrV, BitMask);
  296. // Compare operand values
  297. Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  298. return Builder.CreateICmpEQ(AndV, ZeroV);
  299. }
  300. /// Substitutes the div/rem instruction with code that checks the value of the
  301. /// operands and uses a shorter-faster div/rem instruction when possible.
  302. Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  303. Value *Dividend = SlowDivOrRem->getOperand(0);
  304. Value *Divisor = SlowDivOrRem->getOperand(1);
  305. VisitedSetTy SetL;
  306. ValueRange DividendRange = getValueRange(Dividend, SetL);
  307. if (DividendRange == VALRNG_LIKELY_LONG)
  308. return None;
  309. VisitedSetTy SetR;
  310. ValueRange DivisorRange = getValueRange(Divisor, SetR);
  311. if (DivisorRange == VALRNG_LIKELY_LONG)
  312. return None;
  313. bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  314. bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
  315. if (DividendShort && DivisorShort) {
  316. // If both operands are known to be short then just replace the long
  317. // division with a short one in-place. Since we're not introducing control
  318. // flow in this case, narrowing the division is always a win, even if the
  319. // divisor is a constant (and will later get replaced by a multiplication).
  320. IRBuilder<> Builder(SlowDivOrRem);
  321. Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
  322. Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
  323. Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
  324. Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
  325. Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
  326. Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
  327. return QuotRemPair(ExtDiv, ExtRem);
  328. }
  329. if (isa<ConstantInt>(Divisor)) {
  330. // If the divisor is not a constant, DAGCombiner will convert it to a
  331. // multiplication by a magic constant. It isn't clear if it is worth
  332. // introducing control flow to get a narrower multiply.
  333. return None;
  334. }
  335. // After Constant Hoisting pass, long constants may be represented as
  336. // bitcast instructions. As a result, some constants may look like an
  337. // instruction at first, and an additional check is necessary to find out if
  338. // an operand is actually a constant.
  339. if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
  340. if (BCI->getParent() == SlowDivOrRem->getParent() &&
  341. isa<ConstantInt>(BCI->getOperand(0)))
  342. return None;
  343. IRBuilder<> Builder(MainBB, MainBB->end());
  344. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  345. if (DividendShort && !isSignedOp()) {
  346. // If the division is unsigned and Dividend is known to be short, then
  347. // either
  348. // 1) Divisor is less or equal to Dividend, and the result can be computed
  349. // with a short division.
  350. // 2) Divisor is greater than Dividend. In this case, no division is needed
  351. // at all: The quotient is 0 and the remainder is equal to Dividend.
  352. //
  353. // So instead of checking at runtime whether Divisor fits into BypassType,
  354. // we emit a runtime check to differentiate between these two cases. This
  355. // lets us entirely avoid a long div.
  356. // Split the basic block before the div/rem.
  357. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
  358. // Remove the unconditional branch from MainBB to SuccessorBB.
  359. MainBB->getInstList().back().eraseFromParent();
  360. QuotRemWithBB Long;
  361. Long.BB = MainBB;
  362. Long.Quotient = ConstantInt::get(getSlowType(), 0);
  363. Long.Remainder = Dividend;
  364. QuotRemWithBB Fast = createFastBB(SuccessorBB);
  365. QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
  366. Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
  367. Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
  368. return Result;
  369. } else {
  370. // General case. Create both slow and fast div/rem pairs and choose one of
  371. // them at runtime.
  372. // Split the basic block before the div/rem.
  373. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
  374. // Remove the unconditional branch from MainBB to SuccessorBB.
  375. MainBB->getInstList().back().eraseFromParent();
  376. QuotRemWithBB Fast = createFastBB(SuccessorBB);
  377. QuotRemWithBB Slow = createSlowBB(SuccessorBB);
  378. QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
  379. Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
  380. DivisorShort ? nullptr : Divisor);
  381. Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
  382. return Result;
  383. }
  384. }
  385. /// This optimization identifies DIV/REM instructions in a BB that can be
  386. /// profitably bypassed and carried out with a shorter, faster divide.
  387. bool llvm::bypassSlowDivision(BasicBlock *BB,
  388. const BypassWidthsTy &BypassWidths) {
  389. DivCacheTy PerBBDivCache;
  390. bool MadeChange = false;
  391. Instruction *Next = &*BB->begin();
  392. while (Next != nullptr) {
  393. // We may add instructions immediately after I, but we want to skip over
  394. // them.
  395. Instruction *I = Next;
  396. Next = Next->getNextNode();
  397. // Ignore dead code to save time and avoid bugs.
  398. if (I->hasNUses(0))
  399. continue;
  400. FastDivInsertionTask Task(I, BypassWidths);
  401. if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
  402. I->replaceAllUsesWith(Replacement);
  403. I->eraseFromParent();
  404. MadeChange = true;
  405. }
  406. }
  407. // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  408. // create divrem machine instructions. Now erase any unused divs / rems so we
  409. // don't leave extra instructions sitting around.
  410. for (auto &KV : PerBBDivCache)
  411. for (Value *V : {KV.second.Quotient, KV.second.Remainder})
  412. RecursivelyDeleteTriviallyDeadInstructions(V);
  413. return MadeChange;
  414. }