BypassSlowDivision.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains an optimization for div and rem on architectures that
  10. // execute short instructions significantly faster than longer instructions.
  11. // For example, on Intel Atom 32-bit divides are slow enough that during
  12. // runtime it is profitable to check the value of the operands, and if they are
  13. // positive and less than 256 use an unsigned 8-bit divide.
  14. //
  15. //===----------------------------------------------------------------------===//
  16. #include "llvm/Transforms/Utils/BypassSlowDivision.h"
  17. #include "llvm/ADT/DenseMap.h"
  18. #include "llvm/ADT/STLExtras.h"
  19. #include "llvm/ADT/SmallPtrSet.h"
  20. #include "llvm/Transforms/Utils/Local.h"
  21. #include "llvm/Analysis/ValueTracking.h"
  22. #include "llvm/IR/BasicBlock.h"
  23. #include "llvm/IR/Constants.h"
  24. #include "llvm/IR/DerivedTypes.h"
  25. #include "llvm/IR/Function.h"
  26. #include "llvm/IR/IRBuilder.h"
  27. #include "llvm/IR/Instruction.h"
  28. #include "llvm/IR/Instructions.h"
  29. #include "llvm/IR/Module.h"
  30. #include "llvm/IR/Type.h"
  31. #include "llvm/IR/Value.h"
  32. #include "llvm/Support/Casting.h"
  33. #include "llvm/Support/KnownBits.h"
  34. #include <cassert>
  35. #include <cstdint>
  36. using namespace llvm;
  37. #define DEBUG_TYPE "bypass-slow-division"
  38. namespace {
  39. struct QuotRemPair {
  40. Value *Quotient;
  41. Value *Remainder;
  42. QuotRemPair(Value *InQuotient, Value *InRemainder)
  43. : Quotient(InQuotient), Remainder(InRemainder) {}
  44. };
  45. /// A quotient and remainder, plus a BB from which they logically "originate".
  46. /// If you use Quotient or Remainder in a Phi node, you should use BB as its
  47. /// corresponding predecessor.
  48. struct QuotRemWithBB {
  49. BasicBlock *BB = nullptr;
  50. Value *Quotient = nullptr;
  51. Value *Remainder = nullptr;
  52. };
  53. using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
  54. using BypassWidthsTy = DenseMap<unsigned, unsigned>;
  55. using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
  56. enum ValueRange {
  57. /// Operand definitely fits into BypassType. No runtime checks are needed.
  58. VALRNG_KNOWN_SHORT,
  59. /// A runtime check is required, as value range is unknown.
  60. VALRNG_UNKNOWN,
  61. /// Operand is unlikely to fit into BypassType. The bypassing should be
  62. /// disabled.
  63. VALRNG_LIKELY_LONG
  64. };
  65. class FastDivInsertionTask {
  66. bool IsValidTask = false;
  67. Instruction *SlowDivOrRem = nullptr;
  68. IntegerType *BypassType = nullptr;
  69. BasicBlock *MainBB = nullptr;
  70. bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  71. ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  72. QuotRemWithBB createSlowBB(BasicBlock *Successor);
  73. QuotRemWithBB createFastBB(BasicBlock *Successor);
  74. QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
  75. BasicBlock *PhiBB);
  76. Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  77. std::optional<QuotRemPair> insertFastDivAndRem();
  78. bool isSignedOp() {
  79. return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
  80. SlowDivOrRem->getOpcode() == Instruction::SRem;
  81. }
  82. bool isDivisionOp() {
  83. return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
  84. SlowDivOrRem->getOpcode() == Instruction::UDiv;
  85. }
  86. Type *getSlowType() { return SlowDivOrRem->getType(); }
  87. public:
  88. FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
  89. Value *getReplacement(DivCacheTy &Cache);
  90. };
  91. } // end anonymous namespace
  92. FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
  93. const BypassWidthsTy &BypassWidths) {
  94. switch (I->getOpcode()) {
  95. case Instruction::UDiv:
  96. case Instruction::SDiv:
  97. case Instruction::URem:
  98. case Instruction::SRem:
  99. SlowDivOrRem = I;
  100. break;
  101. default:
  102. // I is not a div/rem operation.
  103. return;
  104. }
  105. // Skip division on vector types. Only optimize integer instructions.
  106. IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  107. if (!SlowType)
  108. return;
  109. // Skip if this bitwidth is not bypassed.
  110. auto BI = BypassWidths.find(SlowType->getBitWidth());
  111. if (BI == BypassWidths.end())
  112. return;
  113. // Get type for div/rem instruction with bypass bitwidth.
  114. IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  115. BypassType = BT;
  116. // The original basic block.
  117. MainBB = I->getParent();
  118. // The instruction is indeed a slow div or rem operation.
  119. IsValidTask = true;
  120. }
  121. /// Reuses previously-computed dividend or remainder from the current BB if
  122. /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
  123. /// perform the optimization and caches the resulting dividend and remainder.
  124. /// If no replacement can be generated, nullptr is returned.
  125. Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  126. // First, make sure that the task is valid.
  127. if (!IsValidTask)
  128. return nullptr;
  129. // Then, look for a value in Cache.
  130. Value *Dividend = SlowDivOrRem->getOperand(0);
  131. Value *Divisor = SlowDivOrRem->getOperand(1);
  132. DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
  133. auto CacheI = Cache.find(Key);
  134. if (CacheI == Cache.end()) {
  135. // If previous instance does not exist, try to insert fast div.
  136. std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
  137. // Bail out if insertFastDivAndRem has failed.
  138. if (!OptResult)
  139. return nullptr;
  140. CacheI = Cache.insert({Key, *OptResult}).first;
  141. }
  142. QuotRemPair &Value = CacheI->second;
  143. return isDivisionOp() ? Value.Quotient : Value.Remainder;
  144. }
  145. /// Check if a value looks like a hash.
  146. ///
  147. /// The routine is expected to detect values computed using the most common hash
  148. /// algorithms. Typically, hash computations end with one of the following
  149. /// instructions:
  150. ///
  151. /// 1) MUL with a constant wider than BypassType
  152. /// 2) XOR instruction
  153. ///
  154. /// And even if we are wrong and the value is not a hash, it is still quite
  155. /// unlikely that such values will fit into BypassType.
  156. ///
  157. /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
  158. /// It is implemented as a depth-first search for values that look neither long
  159. /// nor hash-like.
  160. bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
  161. Instruction *I = dyn_cast<Instruction>(V);
  162. if (!I)
  163. return false;
  164. switch (I->getOpcode()) {
  165. case Instruction::Xor:
  166. return true;
  167. case Instruction::Mul: {
  168. // After Constant Hoisting pass, long constants may be represented as
  169. // bitcast instructions. As a result, some constants may look like an
  170. // instruction at first, and an additional check is necessary to find out if
  171. // an operand is actually a constant.
  172. Value *Op1 = I->getOperand(1);
  173. ConstantInt *C = dyn_cast<ConstantInt>(Op1);
  174. if (!C && isa<BitCastInst>(Op1))
  175. C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
  176. return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
  177. }
  178. case Instruction::PHI:
  179. // Stop IR traversal in case of a crazy input code. This limits recursion
  180. // depth.
  181. if (Visited.size() >= 16)
  182. return false;
  183. // Do not visit nodes that have been visited already. We return true because
  184. // it means that we couldn't find any value that doesn't look hash-like.
  185. if (!Visited.insert(I).second)
  186. return true;
  187. return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
  188. // Ignore undef values as they probably don't affect the division
  189. // operands.
  190. return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
  191. isa<UndefValue>(V);
  192. });
  193. default:
  194. return false;
  195. }
  196. }
  197. /// Check if an integer value fits into our bypass type.
  198. ValueRange FastDivInsertionTask::getValueRange(Value *V,
  199. VisitedSetTy &Visited) {
  200. unsigned ShortLen = BypassType->getBitWidth();
  201. unsigned LongLen = V->getType()->getIntegerBitWidth();
  202. assert(LongLen > ShortLen && "Value type must be wider than BypassType");
  203. unsigned HiBits = LongLen - ShortLen;
  204. const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
  205. KnownBits Known(LongLen);
  206. computeKnownBits(V, Known, DL);
  207. if (Known.countMinLeadingZeros() >= HiBits)
  208. return VALRNG_KNOWN_SHORT;
  209. if (Known.countMaxLeadingZeros() < HiBits)
  210. return VALRNG_LIKELY_LONG;
  211. // Long integer divisions are often used in hashtable implementations. It's
  212. // not worth bypassing such divisions because hash values are extremely
  213. // unlikely to have enough leading zeros. The call below tries to detect
  214. // values that are unlikely to fit BypassType (including hashes).
  215. if (isHashLikeValue(V, Visited))
  216. return VALRNG_LIKELY_LONG;
  217. return VALRNG_UNKNOWN;
  218. }
  219. /// Add new basic block for slow div and rem operations and put it before
  220. /// SuccessorBB.
  221. QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
  222. QuotRemWithBB DivRemPair;
  223. DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
  224. MainBB->getParent(), SuccessorBB);
  225. IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  226. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  227. Value *Dividend = SlowDivOrRem->getOperand(0);
  228. Value *Divisor = SlowDivOrRem->getOperand(1);
  229. if (isSignedOp()) {
  230. DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
  231. DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
  232. } else {
  233. DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
  234. DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
  235. }
  236. Builder.CreateBr(SuccessorBB);
  237. return DivRemPair;
  238. }
  239. /// Add new basic block for fast div and rem operations and put it before
  240. /// SuccessorBB.
  241. QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
  242. QuotRemWithBB DivRemPair;
  243. DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
  244. MainBB->getParent(), SuccessorBB);
  245. IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  246. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  247. Value *Dividend = SlowDivOrRem->getOperand(0);
  248. Value *Divisor = SlowDivOrRem->getOperand(1);
  249. Value *ShortDivisorV =
  250. Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
  251. Value *ShortDividendV =
  252. Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
  253. // udiv/urem because this optimization only handles positive numbers.
  254. Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
  255. Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
  256. DivRemPair.Quotient =
  257. Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
  258. DivRemPair.Remainder =
  259. Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
  260. Builder.CreateBr(SuccessorBB);
  261. return DivRemPair;
  262. }
  263. /// Creates Phi nodes for result of Div and Rem.
  264. QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
  265. QuotRemWithBB &RHS,
  266. BasicBlock *PhiBB) {
  267. IRBuilder<> Builder(PhiBB, PhiBB->begin());
  268. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  269. PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  270. QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  271. QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  272. PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  273. RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  274. RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  275. return QuotRemPair(QuoPhi, RemPhi);
  276. }
  277. /// Creates a runtime check to test whether both the divisor and dividend fit
  278. /// into BypassType. The check is inserted at the end of MainBB. True return
  279. /// value means that the operands fit. Either of the operands may be NULL if it
  280. /// doesn't need a runtime check.
  281. Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  282. assert((Op1 || Op2) && "Nothing to check");
  283. IRBuilder<> Builder(MainBB, MainBB->end());
  284. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  285. Value *OrV;
  286. if (Op1 && Op2)
  287. OrV = Builder.CreateOr(Op1, Op2);
  288. else
  289. OrV = Op1 ? Op1 : Op2;
  290. // BitMask is inverted to check if the operands are
  291. // larger than the bypass type
  292. uint64_t BitMask = ~BypassType->getBitMask();
  293. Value *AndV = Builder.CreateAnd(OrV, BitMask);
  294. // Compare operand values
  295. Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  296. return Builder.CreateICmpEQ(AndV, ZeroV);
  297. }
  298. /// Substitutes the div/rem instruction with code that checks the value of the
  299. /// operands and uses a shorter-faster div/rem instruction when possible.
  300. std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  301. Value *Dividend = SlowDivOrRem->getOperand(0);
  302. Value *Divisor = SlowDivOrRem->getOperand(1);
  303. VisitedSetTy SetL;
  304. ValueRange DividendRange = getValueRange(Dividend, SetL);
  305. if (DividendRange == VALRNG_LIKELY_LONG)
  306. return std::nullopt;
  307. VisitedSetTy SetR;
  308. ValueRange DivisorRange = getValueRange(Divisor, SetR);
  309. if (DivisorRange == VALRNG_LIKELY_LONG)
  310. return std::nullopt;
  311. bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  312. bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
  313. if (DividendShort && DivisorShort) {
  314. // If both operands are known to be short then just replace the long
  315. // division with a short one in-place. Since we're not introducing control
  316. // flow in this case, narrowing the division is always a win, even if the
  317. // divisor is a constant (and will later get replaced by a multiplication).
  318. IRBuilder<> Builder(SlowDivOrRem);
  319. Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
  320. Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
  321. Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
  322. Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
  323. Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
  324. Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
  325. return QuotRemPair(ExtDiv, ExtRem);
  326. }
  327. if (isa<ConstantInt>(Divisor)) {
  328. // If the divisor is not a constant, DAGCombiner will convert it to a
  329. // multiplication by a magic constant. It isn't clear if it is worth
  330. // introducing control flow to get a narrower multiply.
  331. return std::nullopt;
  332. }
  333. // After Constant Hoisting pass, long constants may be represented as
  334. // bitcast instructions. As a result, some constants may look like an
  335. // instruction at first, and an additional check is necessary to find out if
  336. // an operand is actually a constant.
  337. if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
  338. if (BCI->getParent() == SlowDivOrRem->getParent() &&
  339. isa<ConstantInt>(BCI->getOperand(0)))
  340. return std::nullopt;
  341. IRBuilder<> Builder(MainBB, MainBB->end());
  342. Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  343. if (DividendShort && !isSignedOp()) {
  344. // If the division is unsigned and Dividend is known to be short, then
  345. // either
  346. // 1) Divisor is less or equal to Dividend, and the result can be computed
  347. // with a short division.
  348. // 2) Divisor is greater than Dividend. In this case, no division is needed
  349. // at all: The quotient is 0 and the remainder is equal to Dividend.
  350. //
  351. // So instead of checking at runtime whether Divisor fits into BypassType,
  352. // we emit a runtime check to differentiate between these two cases. This
  353. // lets us entirely avoid a long div.
  354. // Split the basic block before the div/rem.
  355. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
  356. // Remove the unconditional branch from MainBB to SuccessorBB.
  357. MainBB->back().eraseFromParent();
  358. QuotRemWithBB Long;
  359. Long.BB = MainBB;
  360. Long.Quotient = ConstantInt::get(getSlowType(), 0);
  361. Long.Remainder = Dividend;
  362. QuotRemWithBB Fast = createFastBB(SuccessorBB);
  363. QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
  364. Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
  365. Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
  366. return Result;
  367. } else {
  368. // General case. Create both slow and fast div/rem pairs and choose one of
  369. // them at runtime.
  370. // Split the basic block before the div/rem.
  371. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
  372. // Remove the unconditional branch from MainBB to SuccessorBB.
  373. MainBB->back().eraseFromParent();
  374. QuotRemWithBB Fast = createFastBB(SuccessorBB);
  375. QuotRemWithBB Slow = createSlowBB(SuccessorBB);
  376. QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
  377. Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
  378. DivisorShort ? nullptr : Divisor);
  379. Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
  380. return Result;
  381. }
  382. }
  383. /// This optimization identifies DIV/REM instructions in a BB that can be
  384. /// profitably bypassed and carried out with a shorter, faster divide.
  385. bool llvm::bypassSlowDivision(BasicBlock *BB,
  386. const BypassWidthsTy &BypassWidths) {
  387. DivCacheTy PerBBDivCache;
  388. bool MadeChange = false;
  389. Instruction *Next = &*BB->begin();
  390. while (Next != nullptr) {
  391. // We may add instructions immediately after I, but we want to skip over
  392. // them.
  393. Instruction *I = Next;
  394. Next = Next->getNextNode();
  395. // Ignore dead code to save time and avoid bugs.
  396. if (I->hasNUses(0))
  397. continue;
  398. FastDivInsertionTask Task(I, BypassWidths);
  399. if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
  400. I->replaceAllUsesWith(Replacement);
  401. I->eraseFromParent();
  402. MadeChange = true;
  403. }
  404. }
  405. // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  406. // create divrem machine instructions. Now erase any unused divs / rems so we
  407. // don't leave extra instructions sitting around.
  408. for (auto &KV : PerBBDivCache)
  409. for (Value *V : {KV.second.Quotient, KV.second.Remainder})
  410. RecursivelyDeleteTriviallyDeadInstructions(V);
  411. return MadeChange;
  412. }