LoopUnrollRuntime.cpp 43 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008
  1. //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements some loop unrolling utilities for loops with run-time
  10. // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
  11. // trip counts.
  12. //
  13. // The functions in this file are used to generate extra code when the
  14. // run-time trip count modulo the unroll factor is not 0. When this is the
  15. // case, we need to generate code to execute these 'left over' iterations.
  16. //
  17. // The current strategy generates an if-then-else sequence prior to the
  18. // unrolled loop to execute the 'left over' iterations before or after the
  19. // unrolled loop.
  20. //
  21. //===----------------------------------------------------------------------===//
  22. #include "llvm/ADT/Statistic.h"
  23. #include "llvm/Analysis/DomTreeUpdater.h"
  24. #include "llvm/Analysis/InstructionSimplify.h"
  25. #include "llvm/Analysis/LoopIterator.h"
  26. #include "llvm/Analysis/ScalarEvolution.h"
  27. #include "llvm/Analysis/ValueTracking.h"
  28. #include "llvm/IR/BasicBlock.h"
  29. #include "llvm/IR/Dominators.h"
  30. #include "llvm/IR/MDBuilder.h"
  31. #include "llvm/IR/Module.h"
  32. #include "llvm/IR/ProfDataUtils.h"
  33. #include "llvm/Support/CommandLine.h"
  34. #include "llvm/Support/Debug.h"
  35. #include "llvm/Support/raw_ostream.h"
  36. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  37. #include "llvm/Transforms/Utils/Cloning.h"
  38. #include "llvm/Transforms/Utils/Local.h"
  39. #include "llvm/Transforms/Utils/LoopUtils.h"
  40. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  41. #include "llvm/Transforms/Utils/UnrollLoop.h"
  42. #include <algorithm>
  43. using namespace llvm;
  44. #define DEBUG_TYPE "loop-unroll"
  45. STATISTIC(NumRuntimeUnrolled,
  46. "Number of loops unrolled with run-time trip counts");
  47. static cl::opt<bool> UnrollRuntimeMultiExit(
  48. "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
  49. cl::desc("Allow runtime unrolling for loops with multiple exits, when "
  50. "epilog is generated"));
  51. static cl::opt<bool> UnrollRuntimeOtherExitPredictable(
  52. "unroll-runtime-other-exit-predictable", cl::init(false), cl::Hidden,
  53. cl::desc("Assume the non latch exit block to be predictable"));
  54. /// Connect the unrolling prolog code to the original loop.
  55. /// The unrolling prolog code contains code to execute the
  56. /// 'extra' iterations if the run-time trip count modulo the
  57. /// unroll count is non-zero.
  58. ///
  59. /// This function performs the following:
  60. /// - Create PHI nodes at prolog end block to combine values
  61. /// that exit the prolog code and jump around the prolog.
  62. /// - Add a PHI operand to a PHI node at the loop exit block
  63. /// for values that exit the prolog and go around the loop.
  64. /// - Branch around the original loop if the trip count is less
  65. /// than the unroll factor.
  66. ///
  67. static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  68. BasicBlock *PrologExit,
  69. BasicBlock *OriginalLoopLatchExit,
  70. BasicBlock *PreHeader, BasicBlock *NewPreHeader,
  71. ValueToValueMapTy &VMap, DominatorTree *DT,
  72. LoopInfo *LI, bool PreserveLCSSA,
  73. ScalarEvolution &SE) {
  74. // Loop structure should be the following:
  75. // Preheader
  76. // PrologHeader
  77. // ...
  78. // PrologLatch
  79. // PrologExit
  80. // NewPreheader
  81. // Header
  82. // ...
  83. // Latch
  84. // LatchExit
  85. BasicBlock *Latch = L->getLoopLatch();
  86. assert(Latch && "Loop must have a latch");
  87. BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
  88. // Create a PHI node for each outgoing value from the original loop
  89. // (which means it is an outgoing value from the prolog code too).
  90. // The new PHI node is inserted in the prolog end basic block.
  91. // The new PHI node value is added as an operand of a PHI node in either
  92. // the loop header or the loop exit block.
  93. for (BasicBlock *Succ : successors(Latch)) {
  94. for (PHINode &PN : Succ->phis()) {
  95. // Add a new PHI node to the prolog end block and add the
  96. // appropriate incoming values.
  97. // TODO: This code assumes that the PrologExit (or the LatchExit block for
  98. // prolog loop) contains only one predecessor from the loop, i.e. the
  99. // PrologLatch. When supporting multiple-exiting block loops, we can have
  100. // two or more blocks that have the LatchExit as the target in the
  101. // original loop.
  102. PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
  103. PrologExit->getFirstNonPHI());
  104. // Adding a value to the new PHI node from the original loop preheader.
  105. // This is the value that skips all the prolog code.
  106. if (L->contains(&PN)) {
  107. // Succ is loop header.
  108. NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
  109. PreHeader);
  110. } else {
  111. // Succ is LatchExit.
  112. NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
  113. }
  114. Value *V = PN.getIncomingValueForBlock(Latch);
  115. if (Instruction *I = dyn_cast<Instruction>(V)) {
  116. if (L->contains(I)) {
  117. V = VMap.lookup(I);
  118. }
  119. }
  120. // Adding a value to the new PHI node from the last prolog block
  121. // that was created.
  122. NewPN->addIncoming(V, PrologLatch);
  123. // Update the existing PHI node operand with the value from the
  124. // new PHI node. How this is done depends on if the existing
  125. // PHI node is in the original loop block, or the exit block.
  126. if (L->contains(&PN))
  127. PN.setIncomingValueForBlock(NewPreHeader, NewPN);
  128. else
  129. PN.addIncoming(NewPN, PrologExit);
  130. SE.forgetValue(&PN);
  131. }
  132. }
  133. // Make sure that created prolog loop is in simplified form
  134. SmallVector<BasicBlock *, 4> PrologExitPreds;
  135. Loop *PrologLoop = LI->getLoopFor(PrologLatch);
  136. if (PrologLoop) {
  137. for (BasicBlock *PredBB : predecessors(PrologExit))
  138. if (PrologLoop->contains(PredBB))
  139. PrologExitPreds.push_back(PredBB);
  140. SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
  141. nullptr, PreserveLCSSA);
  142. }
  143. // Create a branch around the original loop, which is taken if there are no
  144. // iterations remaining to be executed after running the prologue.
  145. Instruction *InsertPt = PrologExit->getTerminator();
  146. IRBuilder<> B(InsertPt);
  147. assert(Count != 0 && "nonsensical Count!");
  148. // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
  149. // This means %xtraiter is (BECount + 1) and all of the iterations of this
  150. // loop were executed by the prologue. Note that if BECount <u (Count - 1)
  151. // then (BECount + 1) cannot unsigned-overflow.
  152. Value *BrLoopExit =
  153. B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
  154. // Split the exit to maintain loop canonicalization guarantees
  155. SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
  156. SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
  157. nullptr, PreserveLCSSA);
  158. // Add the branch to the exit block (around the unrolled loop)
  159. B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
  160. InsertPt->eraseFromParent();
  161. if (DT) {
  162. auto *NewDom = DT->findNearestCommonDominator(OriginalLoopLatchExit,
  163. PrologExit);
  164. DT->changeImmediateDominator(OriginalLoopLatchExit, NewDom);
  165. }
  166. }
  167. /// Connect the unrolling epilog code to the original loop.
  168. /// The unrolling epilog code contains code to execute the
  169. /// 'extra' iterations if the run-time trip count modulo the
  170. /// unroll count is non-zero.
  171. ///
  172. /// This function performs the following:
  173. /// - Update PHI nodes at the unrolling loop exit and epilog loop exit
  174. /// - Create PHI nodes at the unrolling loop exit to combine
  175. /// values that exit the unrolling loop code and jump around it.
  176. /// - Update PHI operands in the epilog loop by the new PHI nodes
  177. /// - Branch around the epilog loop if extra iters (ModVal) is zero.
  178. ///
  179. static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
  180. BasicBlock *Exit, BasicBlock *PreHeader,
  181. BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
  182. ValueToValueMapTy &VMap, DominatorTree *DT,
  183. LoopInfo *LI, bool PreserveLCSSA,
  184. ScalarEvolution &SE) {
  185. BasicBlock *Latch = L->getLoopLatch();
  186. assert(Latch && "Loop must have a latch");
  187. BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
  188. // Loop structure should be the following:
  189. //
  190. // PreHeader
  191. // NewPreHeader
  192. // Header
  193. // ...
  194. // Latch
  195. // NewExit (PN)
  196. // EpilogPreHeader
  197. // EpilogHeader
  198. // ...
  199. // EpilogLatch
  200. // Exit (EpilogPN)
  201. // Update PHI nodes at NewExit and Exit.
  202. for (PHINode &PN : NewExit->phis()) {
  203. // PN should be used in another PHI located in Exit block as
  204. // Exit was split by SplitBlockPredecessors into Exit and NewExit
  205. // Basically it should look like:
  206. // NewExit:
  207. // PN = PHI [I, Latch]
  208. // ...
  209. // Exit:
  210. // EpilogPN = PHI [PN, EpilogPreHeader], [X, Exit2], [Y, Exit2.epil]
  211. //
  212. // Exits from non-latch blocks point to the original exit block and the
  213. // epilogue edges have already been added.
  214. //
  215. // There is EpilogPreHeader incoming block instead of NewExit as
  216. // NewExit was spilt 1 more time to get EpilogPreHeader.
  217. assert(PN.hasOneUse() && "The phi should have 1 use");
  218. PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
  219. assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
  220. // Add incoming PreHeader from branch around the Loop
  221. PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
  222. SE.forgetValue(&PN);
  223. Value *V = PN.getIncomingValueForBlock(Latch);
  224. Instruction *I = dyn_cast<Instruction>(V);
  225. if (I && L->contains(I))
  226. // If value comes from an instruction in the loop add VMap value.
  227. V = VMap.lookup(I);
  228. // For the instruction out of the loop, constant or undefined value
  229. // insert value itself.
  230. EpilogPN->addIncoming(V, EpilogLatch);
  231. assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
  232. "EpilogPN should have EpilogPreHeader incoming block");
  233. // Change EpilogPreHeader incoming block to NewExit.
  234. EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
  235. NewExit);
  236. // Now PHIs should look like:
  237. // NewExit:
  238. // PN = PHI [I, Latch], [undef, PreHeader]
  239. // ...
  240. // Exit:
  241. // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
  242. }
  243. // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
  244. // Update corresponding PHI nodes in epilog loop.
  245. for (BasicBlock *Succ : successors(Latch)) {
  246. // Skip this as we already updated phis in exit blocks.
  247. if (!L->contains(Succ))
  248. continue;
  249. for (PHINode &PN : Succ->phis()) {
  250. // Add new PHI nodes to the loop exit block and update epilog
  251. // PHIs with the new PHI values.
  252. PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
  253. NewExit->getFirstNonPHI());
  254. // Adding a value to the new PHI node from the unrolling loop preheader.
  255. NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
  256. // Adding a value to the new PHI node from the unrolling loop latch.
  257. NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
  258. // Update the existing PHI node operand with the value from the new PHI
  259. // node. Corresponding instruction in epilog loop should be PHI.
  260. PHINode *VPN = cast<PHINode>(VMap[&PN]);
  261. VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
  262. }
  263. }
  264. Instruction *InsertPt = NewExit->getTerminator();
  265. IRBuilder<> B(InsertPt);
  266. Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
  267. assert(Exit && "Loop must have a single exit block only");
  268. // Split the epilogue exit to maintain loop canonicalization guarantees
  269. SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
  270. SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
  271. PreserveLCSSA);
  272. // Add the branch to the exit block (around the unrolling loop)
  273. B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
  274. InsertPt->eraseFromParent();
  275. if (DT) {
  276. auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
  277. DT->changeImmediateDominator(Exit, NewDom);
  278. }
  279. // Split the main loop exit to maintain canonicalization guarantees.
  280. SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
  281. SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
  282. PreserveLCSSA);
  283. }
  284. /// Create a clone of the blocks in a loop and connect them together. A new
  285. /// loop will be created including all cloned blocks, and the iterator of the
  286. /// new loop switched to count NewIter down to 0.
  287. /// The cloned blocks should be inserted between InsertTop and InsertBot.
  288. /// InsertTop should be new preheader, InsertBot new loop exit.
  289. /// Returns the new cloned loop that is created.
  290. static Loop *
  291. CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
  292. const bool UnrollRemainder,
  293. BasicBlock *InsertTop,
  294. BasicBlock *InsertBot, BasicBlock *Preheader,
  295. std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
  296. ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
  297. StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
  298. BasicBlock *Header = L->getHeader();
  299. BasicBlock *Latch = L->getLoopLatch();
  300. Function *F = Header->getParent();
  301. LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  302. LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  303. Loop *ParentLoop = L->getParentLoop();
  304. NewLoopsMap NewLoops;
  305. NewLoops[ParentLoop] = ParentLoop;
  306. // For each block in the original loop, create a new copy,
  307. // and update the value map with the newly created values.
  308. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  309. BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
  310. NewBlocks.push_back(NewBB);
  311. addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
  312. VMap[*BB] = NewBB;
  313. if (Header == *BB) {
  314. // For the first block, add a CFG connection to this newly
  315. // created block.
  316. InsertTop->getTerminator()->setSuccessor(0, NewBB);
  317. }
  318. if (DT) {
  319. if (Header == *BB) {
  320. // The header is dominated by the preheader.
  321. DT->addNewBlock(NewBB, InsertTop);
  322. } else {
  323. // Copy information from original loop to unrolled loop.
  324. BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
  325. DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
  326. }
  327. }
  328. if (Latch == *BB) {
  329. // For the last block, create a loop back to cloned head.
  330. VMap.erase((*BB)->getTerminator());
  331. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
  332. // Subtle: NewIter can be 0 if we wrapped when computing the trip count,
  333. // thus we must compare the post-increment (wrapping) value.
  334. BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
  335. BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
  336. IRBuilder<> Builder(LatchBR);
  337. PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
  338. suffix + ".iter",
  339. FirstLoopBB->getFirstNonPHI());
  340. auto *Zero = ConstantInt::get(NewIdx->getType(), 0);
  341. auto *One = ConstantInt::get(NewIdx->getType(), 1);
  342. Value *IdxNext = Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
  343. Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
  344. Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
  345. NewIdx->addIncoming(Zero, InsertTop);
  346. NewIdx->addIncoming(IdxNext, NewBB);
  347. LatchBR->eraseFromParent();
  348. }
  349. }
  350. // Change the incoming values to the ones defined in the preheader or
  351. // cloned loop.
  352. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  353. PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
  354. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  355. NewPHI->setIncomingBlock(idx, InsertTop);
  356. BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  357. idx = NewPHI->getBasicBlockIndex(Latch);
  358. Value *InVal = NewPHI->getIncomingValue(idx);
  359. NewPHI->setIncomingBlock(idx, NewLatch);
  360. if (Value *V = VMap.lookup(InVal))
  361. NewPHI->setIncomingValue(idx, V);
  362. }
  363. Loop *NewLoop = NewLoops[L];
  364. assert(NewLoop && "L should have been cloned");
  365. MDNode *LoopID = NewLoop->getLoopID();
  366. // Only add loop metadata if the loop is not going to be completely
  367. // unrolled.
  368. if (UnrollRemainder)
  369. return NewLoop;
  370. std::optional<MDNode *> NewLoopID = makeFollowupLoopID(
  371. LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
  372. if (NewLoopID) {
  373. NewLoop->setLoopID(*NewLoopID);
  374. // Do not setLoopAlreadyUnrolled if loop attributes have been defined
  375. // explicitly.
  376. return NewLoop;
  377. }
  378. // Add unroll disable metadata to disable future unrolling for this loop.
  379. NewLoop->setLoopAlreadyUnrolled();
  380. return NewLoop;
  381. }
  382. /// Returns true if we can profitably unroll the multi-exit loop L. Currently,
  383. /// we return true only if UnrollRuntimeMultiExit is set to true.
  384. static bool canProfitablyUnrollMultiExitLoop(
  385. Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
  386. bool UseEpilogRemainder) {
  387. // Priority goes to UnrollRuntimeMultiExit if it's supplied.
  388. if (UnrollRuntimeMultiExit.getNumOccurrences())
  389. return UnrollRuntimeMultiExit;
  390. // The main pain point with multi-exit loop unrolling is that once unrolled,
  391. // we will not be able to merge all blocks into a straight line code.
  392. // There are branches within the unrolled loop that go to the OtherExits.
  393. // The second point is the increase in code size, but this is true
  394. // irrespective of multiple exits.
  395. // Note: Both the heuristics below are coarse grained. We are essentially
  396. // enabling unrolling of loops that have a single side exit other than the
  397. // normal LatchExit (i.e. exiting into a deoptimize block).
  398. // The heuristics considered are:
  399. // 1. low number of branches in the unrolled version.
  400. // 2. high predictability of these extra branches.
  401. // We avoid unrolling loops that have more than two exiting blocks. This
  402. // limits the total number of branches in the unrolled loop to be atmost
  403. // the unroll factor (since one of the exiting blocks is the latch block).
  404. SmallVector<BasicBlock*, 4> ExitingBlocks;
  405. L->getExitingBlocks(ExitingBlocks);
  406. if (ExitingBlocks.size() > 2)
  407. return false;
  408. // Allow unrolling of loops with no non latch exit blocks.
  409. if (OtherExits.size() == 0)
  410. return true;
  411. // The second heuristic is that L has one exit other than the latchexit and
  412. // that exit is a deoptimize block. We know that deoptimize blocks are rarely
  413. // taken, which also implies the branch leading to the deoptimize block is
  414. // highly predictable. When UnrollRuntimeOtherExitPredictable is specified, we
  415. // assume the other exit branch is predictable even if it has no deoptimize
  416. // call.
  417. return (OtherExits.size() == 1 &&
  418. (UnrollRuntimeOtherExitPredictable ||
  419. OtherExits[0]->getTerminatingDeoptimizeCall()));
  420. // TODO: These can be fine-tuned further to consider code size or deopt states
  421. // that are captured by the deoptimize exit block.
  422. // Also, we can extend this to support more cases, if we actually
  423. // know of kinds of multiexit loops that would benefit from unrolling.
  424. }
  425. // Assign the maximum possible trip count as the back edge weight for the
  426. // remainder loop if the original loop comes with a branch weight.
  427. static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
  428. Loop *RemainderLoop,
  429. uint64_t UnrollFactor) {
  430. uint64_t TrueWeight, FalseWeight;
  431. BranchInst *LatchBR =
  432. cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
  433. if (!extractBranchWeights(*LatchBR, TrueWeight, FalseWeight))
  434. return;
  435. uint64_t ExitWeight = LatchBR->getSuccessor(0) == OrigLoop->getHeader()
  436. ? FalseWeight
  437. : TrueWeight;
  438. assert(UnrollFactor > 1);
  439. uint64_t BackEdgeWeight = (UnrollFactor - 1) * ExitWeight;
  440. BasicBlock *Header = RemainderLoop->getHeader();
  441. BasicBlock *Latch = RemainderLoop->getLoopLatch();
  442. auto *RemainderLatchBR = cast<BranchInst>(Latch->getTerminator());
  443. unsigned HeaderIdx = (RemainderLatchBR->getSuccessor(0) == Header ? 0 : 1);
  444. MDBuilder MDB(RemainderLatchBR->getContext());
  445. MDNode *WeightNode =
  446. HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
  447. : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
  448. RemainderLatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
  449. }
  450. /// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain
  451. /// accounting for the possibility of unsigned overflow in the 2s complement
  452. /// domain. Preconditions:
  453. /// 1) TripCount = BECount + 1 (allowing overflow)
  454. /// 2) Log2(Count) <= BitWidth(BECount)
  455. static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,
  456. Value *TripCount, unsigned Count) {
  457. // Note that TripCount is BECount + 1.
  458. if (isPowerOf2_32(Count))
  459. // If the expression is zero, then either:
  460. // 1. There are no iterations to be run in the prolog/epilog loop.
  461. // OR
  462. // 2. The addition computing TripCount overflowed.
  463. //
  464. // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
  465. // the number of iterations that remain to be run in the original loop is a
  466. // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (a
  467. // precondition of this method).
  468. return B.CreateAnd(TripCount, Count - 1, "xtraiter");
  469. // As (BECount + 1) can potentially unsigned overflow we count
  470. // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
  471. Constant *CountC = ConstantInt::get(BECount->getType(), Count);
  472. Value *ModValTmp = B.CreateURem(BECount, CountC);
  473. Value *ModValAdd = B.CreateAdd(ModValTmp,
  474. ConstantInt::get(ModValTmp->getType(), 1));
  475. // At that point (BECount % Count) + 1 could be equal to Count.
  476. // To handle this case we need to take mod by Count one more time.
  477. return B.CreateURem(ModValAdd, CountC, "xtraiter");
  478. }
  479. /// Insert code in the prolog/epilog code when unrolling a loop with a
  480. /// run-time trip-count.
  481. ///
  482. /// This method assumes that the loop unroll factor is total number
  483. /// of loop bodies in the loop after unrolling. (Some folks refer
  484. /// to the unroll factor as the number of *extra* copies added).
  485. /// We assume also that the loop unroll factor is a power-of-two. So, after
  486. /// unrolling the loop, the number of loop bodies executed is 2,
  487. /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
  488. /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
  489. /// the switch instruction is generated.
  490. ///
  491. /// ***Prolog case***
  492. /// extraiters = tripcount % loopfactor
  493. /// if (extraiters == 0) jump Loop:
  494. /// else jump Prol:
  495. /// Prol: LoopBody;
  496. /// extraiters -= 1 // Omitted if unroll factor is 2.
  497. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
  498. /// if (tripcount < loopfactor) jump End:
  499. /// Loop:
  500. /// ...
  501. /// End:
  502. ///
  503. /// ***Epilog case***
  504. /// extraiters = tripcount % loopfactor
  505. /// if (tripcount < loopfactor) jump LoopExit:
  506. /// unroll_iters = tripcount - extraiters
  507. /// Loop: LoopBody; (executes unroll_iter times);
  508. /// unroll_iter -= 1
  509. /// if (unroll_iter != 0) jump Loop:
  510. /// LoopExit:
  511. /// if (extraiters == 0) jump EpilExit:
  512. /// Epil: LoopBody; (executes extraiters times)
  513. /// extraiters -= 1 // Omitted if unroll factor is 2.
  514. /// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
  515. /// EpilExit:
  516. bool llvm::UnrollRuntimeLoopRemainder(
  517. Loop *L, unsigned Count, bool AllowExpensiveTripCount,
  518. bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
  519. LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
  520. const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
  521. LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
  522. LLVM_DEBUG(L->dump());
  523. LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
  524. : dbgs() << "Using prolog remainder.\n");
  525. // Make sure the loop is in canonical form.
  526. if (!L->isLoopSimplifyForm()) {
  527. LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
  528. return false;
  529. }
  530. // Guaranteed by LoopSimplifyForm.
  531. BasicBlock *Latch = L->getLoopLatch();
  532. BasicBlock *Header = L->getHeader();
  533. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  534. if (!LatchBR || LatchBR->isUnconditional()) {
  535. // The loop-rotate pass can be helpful to avoid this in many cases.
  536. LLVM_DEBUG(
  537. dbgs()
  538. << "Loop latch not terminated by a conditional branch.\n");
  539. return false;
  540. }
  541. unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
  542. BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
  543. if (L->contains(LatchExit)) {
  544. // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
  545. // targets of the Latch be an exit block out of the loop.
  546. LLVM_DEBUG(
  547. dbgs()
  548. << "One of the loop latch successors must be the exit block.\n");
  549. return false;
  550. }
  551. // These are exit blocks other than the target of the latch exiting block.
  552. SmallVector<BasicBlock *, 4> OtherExits;
  553. L->getUniqueNonLatchExitBlocks(OtherExits);
  554. // Support only single exit and exiting block unless multi-exit loop
  555. // unrolling is enabled.
  556. if (!L->getExitingBlock() || OtherExits.size()) {
  557. // We rely on LCSSA form being preserved when the exit blocks are transformed.
  558. // (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)
  559. if (!PreserveLCSSA)
  560. return false;
  561. if (!canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit,
  562. UseEpilogRemainder)) {
  563. LLVM_DEBUG(
  564. dbgs()
  565. << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
  566. "enabled!\n");
  567. return false;
  568. }
  569. }
  570. // Use Scalar Evolution to compute the trip count. This allows more loops to
  571. // be unrolled than relying on induction var simplification.
  572. if (!SE)
  573. return false;
  574. // Only unroll loops with a computable trip count.
  575. // We calculate the backedge count by using getExitCount on the Latch block,
  576. // which is proven to be the only exiting block in this loop. This is same as
  577. // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
  578. // exiting blocks).
  579. const SCEV *BECountSC = SE->getExitCount(L, Latch);
  580. if (isa<SCEVCouldNotCompute>(BECountSC)) {
  581. LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
  582. return false;
  583. }
  584. unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
  585. // Add 1 since the backedge count doesn't include the first loop iteration.
  586. // (Note that overflow can occur, this is handled explicitly below)
  587. const SCEV *TripCountSC =
  588. SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
  589. if (isa<SCEVCouldNotCompute>(TripCountSC)) {
  590. LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
  591. return false;
  592. }
  593. BasicBlock *PreHeader = L->getLoopPreheader();
  594. BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  595. const DataLayout &DL = Header->getModule()->getDataLayout();
  596. SCEVExpander Expander(*SE, DL, "loop-unroll");
  597. if (!AllowExpensiveTripCount &&
  598. Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
  599. TTI, PreHeaderBR)) {
  600. LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
  601. return false;
  602. }
  603. // This constraint lets us deal with an overflowing trip count easily; see the
  604. // comment on ModVal below.
  605. if (Log2_32(Count) > BEWidth) {
  606. LLVM_DEBUG(
  607. dbgs()
  608. << "Count failed constraint on overflow trip count calculation.\n");
  609. return false;
  610. }
  611. // Loop structure is the following:
  612. //
  613. // PreHeader
  614. // Header
  615. // ...
  616. // Latch
  617. // LatchExit
  618. BasicBlock *NewPreHeader;
  619. BasicBlock *NewExit = nullptr;
  620. BasicBlock *PrologExit = nullptr;
  621. BasicBlock *EpilogPreHeader = nullptr;
  622. BasicBlock *PrologPreHeader = nullptr;
  623. if (UseEpilogRemainder) {
  624. // If epilog remainder
  625. // Split PreHeader to insert a branch around loop for unrolling.
  626. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
  627. NewPreHeader->setName(PreHeader->getName() + ".new");
  628. // Split LatchExit to create phi nodes from branch above.
  629. NewExit = SplitBlockPredecessors(LatchExit, {Latch}, ".unr-lcssa", DT, LI,
  630. nullptr, PreserveLCSSA);
  631. // NewExit gets its DebugLoc from LatchExit, which is not part of the
  632. // original Loop.
  633. // Fix this by setting Loop's DebugLoc to NewExit.
  634. auto *NewExitTerminator = NewExit->getTerminator();
  635. NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
  636. // Split NewExit to insert epilog remainder loop.
  637. EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
  638. EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
  639. // If the latch exits from multiple level of nested loops, then
  640. // by assumption there must be another loop exit which branches to the
  641. // outer loop and we must adjust the loop for the newly inserted blocks
  642. // to account for the fact that our epilogue is still in the same outer
  643. // loop. Note that this leaves loopinfo temporarily out of sync with the
  644. // CFG until the actual epilogue loop is inserted.
  645. if (auto *ParentL = L->getParentLoop())
  646. if (LI->getLoopFor(LatchExit) != ParentL) {
  647. LI->removeBlock(NewExit);
  648. ParentL->addBasicBlockToLoop(NewExit, *LI);
  649. LI->removeBlock(EpilogPreHeader);
  650. ParentL->addBasicBlockToLoop(EpilogPreHeader, *LI);
  651. }
  652. } else {
  653. // If prolog remainder
  654. // Split the original preheader twice to insert prolog remainder loop
  655. PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
  656. PrologPreHeader->setName(Header->getName() + ".prol.preheader");
  657. PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
  658. DT, LI);
  659. PrologExit->setName(Header->getName() + ".prol.loopexit");
  660. // Split PrologExit to get NewPreHeader.
  661. NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
  662. NewPreHeader->setName(PreHeader->getName() + ".new");
  663. }
  664. // Loop structure should be the following:
  665. // Epilog Prolog
  666. //
  667. // PreHeader PreHeader
  668. // *NewPreHeader *PrologPreHeader
  669. // Header *PrologExit
  670. // ... *NewPreHeader
  671. // Latch Header
  672. // *NewExit ...
  673. // *EpilogPreHeader Latch
  674. // LatchExit LatchExit
  675. // Calculate conditions for branch around loop for unrolling
  676. // in epilog case and around prolog remainder loop in prolog case.
  677. // Compute the number of extra iterations required, which is:
  678. // extra iterations = run-time trip count % loop unroll factor
  679. PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  680. IRBuilder<> B(PreHeaderBR);
  681. Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
  682. PreHeaderBR);
  683. Value *BECount;
  684. // If there are other exits before the latch, that may cause the latch exit
  685. // branch to never be executed, and the latch exit count may be poison.
  686. // In this case, freeze the TripCount and base BECount on the frozen
  687. // TripCount. We will introduce two branches using these values, and it's
  688. // important that they see a consistent value (which would not be guaranteed
  689. // if were frozen independently.)
  690. if ((!OtherExits.empty() || !SE->loopHasNoAbnormalExits(L)) &&
  691. !isGuaranteedNotToBeUndefOrPoison(TripCount, AC, PreHeaderBR, DT)) {
  692. TripCount = B.CreateFreeze(TripCount);
  693. BECount =
  694. B.CreateAdd(TripCount, ConstantInt::get(TripCount->getType(), -1));
  695. } else {
  696. // If we don't need to freeze, use SCEVExpander for BECount as well, to
  697. // allow slightly better value reuse.
  698. BECount =
  699. Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR);
  700. }
  701. Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count);
  702. Value *BranchVal =
  703. UseEpilogRemainder ? B.CreateICmpULT(BECount,
  704. ConstantInt::get(BECount->getType(),
  705. Count - 1)) :
  706. B.CreateIsNotNull(ModVal, "lcmp.mod");
  707. BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
  708. BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
  709. // Branch to either remainder (extra iterations) loop or unrolling loop.
  710. B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
  711. PreHeaderBR->eraseFromParent();
  712. if (DT) {
  713. if (UseEpilogRemainder)
  714. DT->changeImmediateDominator(NewExit, PreHeader);
  715. else
  716. DT->changeImmediateDominator(PrologExit, PreHeader);
  717. }
  718. Function *F = Header->getParent();
  719. // Get an ordered list of blocks in the loop to help with the ordering of the
  720. // cloned blocks in the prolog/epilog code
  721. LoopBlocksDFS LoopBlocks(L);
  722. LoopBlocks.perform(LI);
  723. //
  724. // For each extra loop iteration, create a copy of the loop's basic blocks
  725. // and generate a condition that branches to the copy depending on the
  726. // number of 'left over' iterations.
  727. //
  728. std::vector<BasicBlock *> NewBlocks;
  729. ValueToValueMapTy VMap;
  730. // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  731. // the loop, otherwise we create a cloned loop to execute the extra
  732. // iterations. This function adds the appropriate CFG connections.
  733. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
  734. BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
  735. Loop *remainderLoop = CloneLoopBlocks(
  736. L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
  737. NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
  738. // Assign the maximum possible trip count as the back edge weight for the
  739. // remainder loop if the original loop comes with a branch weight.
  740. if (remainderLoop && !UnrollRemainder)
  741. updateLatchBranchWeightsForRemainderLoop(L, remainderLoop, Count);
  742. // Insert the cloned blocks into the function.
  743. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());
  744. // Now the loop blocks are cloned and the other exiting blocks from the
  745. // remainder are connected to the original Loop's exit blocks. The remaining
  746. // work is to update the phi nodes in the original loop, and take in the
  747. // values from the cloned region.
  748. for (auto *BB : OtherExits) {
  749. // Given we preserve LCSSA form, we know that the values used outside the
  750. // loop will be used through these phi nodes at the exit blocks that are
  751. // transformed below.
  752. for (PHINode &PN : BB->phis()) {
  753. unsigned oldNumOperands = PN.getNumIncomingValues();
  754. // Add the incoming values from the remainder code to the end of the phi
  755. // node.
  756. for (unsigned i = 0; i < oldNumOperands; i++){
  757. auto *PredBB =PN.getIncomingBlock(i);
  758. if (PredBB == Latch)
  759. // The latch exit is handled seperately, see connectX
  760. continue;
  761. if (!L->contains(PredBB))
  762. // Even if we had dedicated exits, the code above inserted an
  763. // extra branch which can reach the latch exit.
  764. continue;
  765. auto *V = PN.getIncomingValue(i);
  766. if (Instruction *I = dyn_cast<Instruction>(V))
  767. if (L->contains(I))
  768. V = VMap.lookup(I);
  769. PN.addIncoming(V, cast<BasicBlock>(VMap[PredBB]));
  770. }
  771. }
  772. #if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
  773. for (BasicBlock *SuccBB : successors(BB)) {
  774. assert(!(llvm::is_contained(OtherExits, SuccBB) || SuccBB == LatchExit) &&
  775. "Breaks the definition of dedicated exits!");
  776. }
  777. #endif
  778. }
  779. // Update the immediate dominator of the exit blocks and blocks that are
  780. // reachable from the exit blocks. This is needed because we now have paths
  781. // from both the original loop and the remainder code reaching the exit
  782. // blocks. While the IDom of these exit blocks were from the original loop,
  783. // now the IDom is the preheader (which decides whether the original loop or
  784. // remainder code should run).
  785. if (DT && !L->getExitingBlock()) {
  786. SmallVector<BasicBlock *, 16> ChildrenToUpdate;
  787. // NB! We have to examine the dom children of all loop blocks, not just
  788. // those which are the IDom of the exit blocks. This is because blocks
  789. // reachable from the exit blocks can have their IDom as the nearest common
  790. // dominator of the exit blocks.
  791. for (auto *BB : L->blocks()) {
  792. auto *DomNodeBB = DT->getNode(BB);
  793. for (auto *DomChild : DomNodeBB->children()) {
  794. auto *DomChildBB = DomChild->getBlock();
  795. if (!L->contains(LI->getLoopFor(DomChildBB)))
  796. ChildrenToUpdate.push_back(DomChildBB);
  797. }
  798. }
  799. for (auto *BB : ChildrenToUpdate)
  800. DT->changeImmediateDominator(BB, PreHeader);
  801. }
  802. // Loop structure should be the following:
  803. // Epilog Prolog
  804. //
  805. // PreHeader PreHeader
  806. // NewPreHeader PrologPreHeader
  807. // Header PrologHeader
  808. // ... ...
  809. // Latch PrologLatch
  810. // NewExit PrologExit
  811. // EpilogPreHeader NewPreHeader
  812. // EpilogHeader Header
  813. // ... ...
  814. // EpilogLatch Latch
  815. // LatchExit LatchExit
  816. // Rewrite the cloned instruction operands to use the values created when the
  817. // clone is created.
  818. for (BasicBlock *BB : NewBlocks) {
  819. for (Instruction &I : *BB) {
  820. RemapInstruction(&I, VMap,
  821. RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
  822. }
  823. }
  824. if (UseEpilogRemainder) {
  825. // Connect the epilog code to the original loop and update the
  826. // PHI functions.
  827. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
  828. NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE);
  829. // Update counter in loop for unrolling.
  830. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
  831. // Subtle: TestVal can be 0 if we wrapped when computing the trip count,
  832. // thus we must compare the post-increment (wrapping) value.
  833. IRBuilder<> B2(NewPreHeader->getTerminator());
  834. Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
  835. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  836. PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
  837. Header->getFirstNonPHI());
  838. B2.SetInsertPoint(LatchBR);
  839. auto *Zero = ConstantInt::get(NewIdx->getType(), 0);
  840. auto *One = ConstantInt::get(NewIdx->getType(), 1);
  841. Value *IdxNext = B2.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
  842. auto Pred = LatchBR->getSuccessor(0) == Header ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
  843. Value *IdxCmp = B2.CreateICmp(Pred, IdxNext, TestVal, NewIdx->getName() + ".ncmp");
  844. NewIdx->addIncoming(Zero, NewPreHeader);
  845. NewIdx->addIncoming(IdxNext, Latch);
  846. LatchBR->setCondition(IdxCmp);
  847. } else {
  848. // Connect the prolog code to the original loop and update the
  849. // PHI functions.
  850. ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
  851. NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE);
  852. }
  853. // If this loop is nested, then the loop unroller changes the code in the any
  854. // of its parent loops, so the Scalar Evolution pass needs to be run again.
  855. SE->forgetTopmostLoop(L);
  856. // Verify that the Dom Tree and Loop Info are correct.
  857. #if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
  858. if (DT) {
  859. assert(DT->verify(DominatorTree::VerificationLevel::Full));
  860. LI->verify(*DT);
  861. }
  862. #endif
  863. // For unroll factor 2 remainder loop will have 1 iteration.
  864. if (Count == 2 && DT && LI && SE) {
  865. // TODO: This code could probably be pulled out into a helper function
  866. // (e.g. breakLoopBackedgeAndSimplify) and reused in loop-deletion.
  867. BasicBlock *RemainderLatch = remainderLoop->getLoopLatch();
  868. assert(RemainderLatch);
  869. SmallVector<BasicBlock*> RemainderBlocks(remainderLoop->getBlocks().begin(),
  870. remainderLoop->getBlocks().end());
  871. breakLoopBackedge(remainderLoop, *DT, *SE, *LI, nullptr);
  872. remainderLoop = nullptr;
  873. // Simplify loop values after breaking the backedge
  874. const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
  875. SmallVector<WeakTrackingVH, 16> DeadInsts;
  876. for (BasicBlock *BB : RemainderBlocks) {
  877. for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
  878. if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC}))
  879. if (LI->replacementPreservesLCSSAForm(&Inst, V))
  880. Inst.replaceAllUsesWith(V);
  881. if (isInstructionTriviallyDead(&Inst))
  882. DeadInsts.emplace_back(&Inst);
  883. }
  884. // We can't do recursive deletion until we're done iterating, as we might
  885. // have a phi which (potentially indirectly) uses instructions later in
  886. // the block we're iterating through.
  887. RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
  888. }
  889. // Merge latch into exit block.
  890. auto *ExitBB = RemainderLatch->getSingleSuccessor();
  891. assert(ExitBB && "required after breaking cond br backedge");
  892. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  893. MergeBlockIntoPredecessor(ExitBB, &DTU, LI);
  894. }
  895. // Canonicalize to LoopSimplifyForm both original and remainder loops. We
  896. // cannot rely on the LoopUnrollPass to do this because it only does
  897. // canonicalization for parent/subloops and not the sibling loops.
  898. if (OtherExits.size() > 0) {
  899. // Generate dedicated exit blocks for the original loop, to preserve
  900. // LoopSimplifyForm.
  901. formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
  902. // Generate dedicated exit blocks for the remainder loop if one exists, to
  903. // preserve LoopSimplifyForm.
  904. if (remainderLoop)
  905. formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
  906. }
  907. auto UnrollResult = LoopUnrollResult::Unmodified;
  908. if (remainderLoop && UnrollRemainder) {
  909. LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
  910. UnrollResult =
  911. UnrollLoop(remainderLoop,
  912. {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false,
  913. /*AllowExpensiveTripCount*/ false,
  914. /*UnrollRemainder*/ false, ForgetAllSCEV},
  915. LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
  916. }
  917. if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
  918. *ResultLoop = remainderLoop;
  919. NumRuntimeUnrolled++;
  920. return true;
  921. }