LoopUnrollRuntime.cpp 42 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989
  1. //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements some loop unrolling utilities for loops with run-time
  10. // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
  11. // trip counts.
  12. //
  13. // The functions in this file are used to generate extra code when the
  14. // run-time trip count modulo the unroll factor is not 0. When this is the
  15. // case, we need to generate code to execute these 'left over' iterations.
  16. //
  17. // The current strategy generates an if-then-else sequence prior to the
  18. // unrolled loop to execute the 'left over' iterations before or after the
  19. // unrolled loop.
  20. //
  21. //===----------------------------------------------------------------------===//
  22. #include "llvm/ADT/SmallPtrSet.h"
  23. #include "llvm/ADT/Statistic.h"
  24. #include "llvm/Analysis/LoopIterator.h"
  25. #include "llvm/Analysis/ScalarEvolution.h"
  26. #include "llvm/IR/BasicBlock.h"
  27. #include "llvm/IR/Dominators.h"
  28. #include "llvm/IR/MDBuilder.h"
  29. #include "llvm/IR/Metadata.h"
  30. #include "llvm/IR/Module.h"
  31. #include "llvm/Support/CommandLine.h"
  32. #include "llvm/Support/Debug.h"
  33. #include "llvm/Support/raw_ostream.h"
  34. #include "llvm/Transforms/Utils.h"
  35. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  36. #include "llvm/Transforms/Utils/Cloning.h"
  37. #include "llvm/Transforms/Utils/LoopUtils.h"
  38. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  39. #include "llvm/Transforms/Utils/UnrollLoop.h"
  40. #include <algorithm>
  41. using namespace llvm;
  42. #define DEBUG_TYPE "loop-unroll"
  43. STATISTIC(NumRuntimeUnrolled,
  44. "Number of loops unrolled with run-time trip counts");
  45. static cl::opt<bool> UnrollRuntimeMultiExit(
  46. "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
  47. cl::desc("Allow runtime unrolling for loops with multiple exits, when "
  48. "epilog is generated"));
  49. /// Connect the unrolling prolog code to the original loop.
  50. /// The unrolling prolog code contains code to execute the
  51. /// 'extra' iterations if the run-time trip count modulo the
  52. /// unroll count is non-zero.
  53. ///
  54. /// This function performs the following:
  55. /// - Create PHI nodes at prolog end block to combine values
  56. /// that exit the prolog code and jump around the prolog.
  57. /// - Add a PHI operand to a PHI node at the loop exit block
  58. /// for values that exit the prolog and go around the loop.
  59. /// - Branch around the original loop if the trip count is less
  60. /// than the unroll factor.
  61. ///
  62. static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  63. BasicBlock *PrologExit,
  64. BasicBlock *OriginalLoopLatchExit,
  65. BasicBlock *PreHeader, BasicBlock *NewPreHeader,
  66. ValueToValueMapTy &VMap, DominatorTree *DT,
  67. LoopInfo *LI, bool PreserveLCSSA) {
  68. // Loop structure should be the following:
  69. // Preheader
  70. // PrologHeader
  71. // ...
  72. // PrologLatch
  73. // PrologExit
  74. // NewPreheader
  75. // Header
  76. // ...
  77. // Latch
  78. // LatchExit
  79. BasicBlock *Latch = L->getLoopLatch();
  80. assert(Latch && "Loop must have a latch");
  81. BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
  82. // Create a PHI node for each outgoing value from the original loop
  83. // (which means it is an outgoing value from the prolog code too).
  84. // The new PHI node is inserted in the prolog end basic block.
  85. // The new PHI node value is added as an operand of a PHI node in either
  86. // the loop header or the loop exit block.
  87. for (BasicBlock *Succ : successors(Latch)) {
  88. for (PHINode &PN : Succ->phis()) {
  89. // Add a new PHI node to the prolog end block and add the
  90. // appropriate incoming values.
  91. // TODO: This code assumes that the PrologExit (or the LatchExit block for
  92. // prolog loop) contains only one predecessor from the loop, i.e. the
  93. // PrologLatch. When supporting multiple-exiting block loops, we can have
  94. // two or more blocks that have the LatchExit as the target in the
  95. // original loop.
  96. PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
  97. PrologExit->getFirstNonPHI());
  98. // Adding a value to the new PHI node from the original loop preheader.
  99. // This is the value that skips all the prolog code.
  100. if (L->contains(&PN)) {
  101. // Succ is loop header.
  102. NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
  103. PreHeader);
  104. } else {
  105. // Succ is LatchExit.
  106. NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
  107. }
  108. Value *V = PN.getIncomingValueForBlock(Latch);
  109. if (Instruction *I = dyn_cast<Instruction>(V)) {
  110. if (L->contains(I)) {
  111. V = VMap.lookup(I);
  112. }
  113. }
  114. // Adding a value to the new PHI node from the last prolog block
  115. // that was created.
  116. NewPN->addIncoming(V, PrologLatch);
  117. // Update the existing PHI node operand with the value from the
  118. // new PHI node. How this is done depends on if the existing
  119. // PHI node is in the original loop block, or the exit block.
  120. if (L->contains(&PN))
  121. PN.setIncomingValueForBlock(NewPreHeader, NewPN);
  122. else
  123. PN.addIncoming(NewPN, PrologExit);
  124. }
  125. }
  126. // Make sure that created prolog loop is in simplified form
  127. SmallVector<BasicBlock *, 4> PrologExitPreds;
  128. Loop *PrologLoop = LI->getLoopFor(PrologLatch);
  129. if (PrologLoop) {
  130. for (BasicBlock *PredBB : predecessors(PrologExit))
  131. if (PrologLoop->contains(PredBB))
  132. PrologExitPreds.push_back(PredBB);
  133. SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
  134. nullptr, PreserveLCSSA);
  135. }
  136. // Create a branch around the original loop, which is taken if there are no
  137. // iterations remaining to be executed after running the prologue.
  138. Instruction *InsertPt = PrologExit->getTerminator();
  139. IRBuilder<> B(InsertPt);
  140. assert(Count != 0 && "nonsensical Count!");
  141. // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
  142. // This means %xtraiter is (BECount + 1) and all of the iterations of this
  143. // loop were executed by the prologue. Note that if BECount <u (Count - 1)
  144. // then (BECount + 1) cannot unsigned-overflow.
  145. Value *BrLoopExit =
  146. B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
  147. // Split the exit to maintain loop canonicalization guarantees
  148. SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
  149. SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
  150. nullptr, PreserveLCSSA);
  151. // Add the branch to the exit block (around the unrolled loop)
  152. B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
  153. InsertPt->eraseFromParent();
  154. if (DT)
  155. DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
  156. }
  157. /// Connect the unrolling epilog code to the original loop.
  158. /// The unrolling epilog code contains code to execute the
  159. /// 'extra' iterations if the run-time trip count modulo the
  160. /// unroll count is non-zero.
  161. ///
  162. /// This function performs the following:
  163. /// - Update PHI nodes at the unrolling loop exit and epilog loop exit
  164. /// - Create PHI nodes at the unrolling loop exit to combine
  165. /// values that exit the unrolling loop code and jump around it.
  166. /// - Update PHI operands in the epilog loop by the new PHI nodes
  167. /// - Branch around the epilog loop if extra iters (ModVal) is zero.
  168. ///
  169. static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
  170. BasicBlock *Exit, BasicBlock *PreHeader,
  171. BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
  172. ValueToValueMapTy &VMap, DominatorTree *DT,
  173. LoopInfo *LI, bool PreserveLCSSA) {
  174. BasicBlock *Latch = L->getLoopLatch();
  175. assert(Latch && "Loop must have a latch");
  176. BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
  177. // Loop structure should be the following:
  178. //
  179. // PreHeader
  180. // NewPreHeader
  181. // Header
  182. // ...
  183. // Latch
  184. // NewExit (PN)
  185. // EpilogPreHeader
  186. // EpilogHeader
  187. // ...
  188. // EpilogLatch
  189. // Exit (EpilogPN)
  190. // Update PHI nodes at NewExit and Exit.
  191. for (PHINode &PN : NewExit->phis()) {
  192. // PN should be used in another PHI located in Exit block as
  193. // Exit was split by SplitBlockPredecessors into Exit and NewExit
  194. // Basicaly it should look like:
  195. // NewExit:
  196. // PN = PHI [I, Latch]
  197. // ...
  198. // Exit:
  199. // EpilogPN = PHI [PN, EpilogPreHeader]
  200. //
  201. // There is EpilogPreHeader incoming block instead of NewExit as
  202. // NewExit was spilt 1 more time to get EpilogPreHeader.
  203. assert(PN.hasOneUse() && "The phi should have 1 use");
  204. PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
  205. assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
  206. // Add incoming PreHeader from branch around the Loop
  207. PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
  208. Value *V = PN.getIncomingValueForBlock(Latch);
  209. Instruction *I = dyn_cast<Instruction>(V);
  210. if (I && L->contains(I))
  211. // If value comes from an instruction in the loop add VMap value.
  212. V = VMap.lookup(I);
  213. // For the instruction out of the loop, constant or undefined value
  214. // insert value itself.
  215. EpilogPN->addIncoming(V, EpilogLatch);
  216. assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
  217. "EpilogPN should have EpilogPreHeader incoming block");
  218. // Change EpilogPreHeader incoming block to NewExit.
  219. EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
  220. NewExit);
  221. // Now PHIs should look like:
  222. // NewExit:
  223. // PN = PHI [I, Latch], [undef, PreHeader]
  224. // ...
  225. // Exit:
  226. // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
  227. }
  228. // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
  229. // Update corresponding PHI nodes in epilog loop.
  230. for (BasicBlock *Succ : successors(Latch)) {
  231. // Skip this as we already updated phis in exit blocks.
  232. if (!L->contains(Succ))
  233. continue;
  234. for (PHINode &PN : Succ->phis()) {
  235. // Add new PHI nodes to the loop exit block and update epilog
  236. // PHIs with the new PHI values.
  237. PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
  238. NewExit->getFirstNonPHI());
  239. // Adding a value to the new PHI node from the unrolling loop preheader.
  240. NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
  241. // Adding a value to the new PHI node from the unrolling loop latch.
  242. NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
  243. // Update the existing PHI node operand with the value from the new PHI
  244. // node. Corresponding instruction in epilog loop should be PHI.
  245. PHINode *VPN = cast<PHINode>(VMap[&PN]);
  246. VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
  247. }
  248. }
  249. Instruction *InsertPt = NewExit->getTerminator();
  250. IRBuilder<> B(InsertPt);
  251. Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
  252. assert(Exit && "Loop must have a single exit block only");
  253. // Split the epilogue exit to maintain loop canonicalization guarantees
  254. SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
  255. SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
  256. PreserveLCSSA);
  257. // Add the branch to the exit block (around the unrolling loop)
  258. B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
  259. InsertPt->eraseFromParent();
  260. if (DT)
  261. DT->changeImmediateDominator(Exit, NewExit);
  262. // Split the main loop exit to maintain canonicalization guarantees.
  263. SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
  264. SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
  265. PreserveLCSSA);
  266. }
  267. /// Create a clone of the blocks in a loop and connect them together.
  268. /// If CreateRemainderLoop is false, loop structure will not be cloned,
  269. /// otherwise a new loop will be created including all cloned blocks, and the
  270. /// iterator of it switches to count NewIter down to 0.
  271. /// The cloned blocks should be inserted between InsertTop and InsertBot.
  272. /// If loop structure is cloned InsertTop should be new preheader, InsertBot
  273. /// new loop exit.
  274. /// Return the new cloned loop that is created when CreateRemainderLoop is true.
  275. static Loop *
  276. CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
  277. const bool UseEpilogRemainder, const bool UnrollRemainder,
  278. BasicBlock *InsertTop,
  279. BasicBlock *InsertBot, BasicBlock *Preheader,
  280. std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
  281. ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
  282. StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
  283. BasicBlock *Header = L->getHeader();
  284. BasicBlock *Latch = L->getLoopLatch();
  285. Function *F = Header->getParent();
  286. LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  287. LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  288. Loop *ParentLoop = L->getParentLoop();
  289. NewLoopsMap NewLoops;
  290. NewLoops[ParentLoop] = ParentLoop;
  291. if (!CreateRemainderLoop)
  292. NewLoops[L] = ParentLoop;
  293. // For each block in the original loop, create a new copy,
  294. // and update the value map with the newly created values.
  295. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  296. BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
  297. NewBlocks.push_back(NewBB);
  298. // If we're unrolling the outermost loop, there's no remainder loop,
  299. // and this block isn't in a nested loop, then the new block is not
  300. // in any loop. Otherwise, add it to loopinfo.
  301. if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
  302. addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
  303. VMap[*BB] = NewBB;
  304. if (Header == *BB) {
  305. // For the first block, add a CFG connection to this newly
  306. // created block.
  307. InsertTop->getTerminator()->setSuccessor(0, NewBB);
  308. }
  309. if (DT) {
  310. if (Header == *BB) {
  311. // The header is dominated by the preheader.
  312. DT->addNewBlock(NewBB, InsertTop);
  313. } else {
  314. // Copy information from original loop to unrolled loop.
  315. BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
  316. DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
  317. }
  318. }
  319. if (Latch == *BB) {
  320. // For the last block, if CreateRemainderLoop is false, create a direct
  321. // jump to InsertBot. If not, create a loop back to cloned head.
  322. VMap.erase((*BB)->getTerminator());
  323. BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
  324. BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
  325. IRBuilder<> Builder(LatchBR);
  326. if (!CreateRemainderLoop) {
  327. Builder.CreateBr(InsertBot);
  328. } else {
  329. PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
  330. suffix + ".iter",
  331. FirstLoopBB->getFirstNonPHI());
  332. Value *IdxSub =
  333. Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  334. NewIdx->getName() + ".sub");
  335. Value *IdxCmp =
  336. Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
  337. Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
  338. NewIdx->addIncoming(NewIter, InsertTop);
  339. NewIdx->addIncoming(IdxSub, NewBB);
  340. }
  341. LatchBR->eraseFromParent();
  342. }
  343. }
  344. // Change the incoming values to the ones defined in the preheader or
  345. // cloned loop.
  346. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  347. PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
  348. if (!CreateRemainderLoop) {
  349. if (UseEpilogRemainder) {
  350. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  351. NewPHI->setIncomingBlock(idx, InsertTop);
  352. NewPHI->removeIncomingValue(Latch, false);
  353. } else {
  354. VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
  355. cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
  356. }
  357. } else {
  358. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  359. NewPHI->setIncomingBlock(idx, InsertTop);
  360. BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  361. idx = NewPHI->getBasicBlockIndex(Latch);
  362. Value *InVal = NewPHI->getIncomingValue(idx);
  363. NewPHI->setIncomingBlock(idx, NewLatch);
  364. if (Value *V = VMap.lookup(InVal))
  365. NewPHI->setIncomingValue(idx, V);
  366. }
  367. }
  368. if (CreateRemainderLoop) {
  369. Loop *NewLoop = NewLoops[L];
  370. assert(NewLoop && "L should have been cloned");
  371. MDNode *LoopID = NewLoop->getLoopID();
  372. // Only add loop metadata if the loop is not going to be completely
  373. // unrolled.
  374. if (UnrollRemainder)
  375. return NewLoop;
  376. Optional<MDNode *> NewLoopID = makeFollowupLoopID(
  377. LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
  378. if (NewLoopID.hasValue()) {
  379. NewLoop->setLoopID(NewLoopID.getValue());
  380. // Do not setLoopAlreadyUnrolled if loop attributes have been defined
  381. // explicitly.
  382. return NewLoop;
  383. }
  384. // Add unroll disable metadata to disable future unrolling for this loop.
  385. NewLoop->setLoopAlreadyUnrolled();
  386. return NewLoop;
  387. }
  388. else
  389. return nullptr;
  390. }
  391. /// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
  392. /// is populated with all the loop exit blocks other than the LatchExit block.
  393. static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
  394. bool PreserveLCSSA,
  395. bool UseEpilogRemainder) {
  396. // We currently have some correctness constrains in unrolling a multi-exit
  397. // loop. Check for these below.
  398. // We rely on LCSSA form being preserved when the exit blocks are transformed.
  399. if (!PreserveLCSSA)
  400. return false;
  401. // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
  402. // UnrollRuntimeMultiExit is true. This will need updating the logic in
  403. // connectEpilog/connectProlog.
  404. if (!LatchExit->getSinglePredecessor()) {
  405. LLVM_DEBUG(
  406. dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
  407. "predecessor.\n");
  408. return false;
  409. }
  410. // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
  411. // and L is an inner loop. This is because in presence of multiple exits, the
  412. // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
  413. // outer loop. This is automatically handled in the prolog case, so we do not
  414. // have that bug in prolog generation.
  415. if (UseEpilogRemainder && L->getParentLoop())
  416. return false;
  417. // All constraints have been satisfied.
  418. return true;
  419. }
  420. /// Returns true if we can profitably unroll the multi-exit loop L. Currently,
  421. /// we return true only if UnrollRuntimeMultiExit is set to true.
  422. static bool canProfitablyUnrollMultiExitLoop(
  423. Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
  424. bool PreserveLCSSA, bool UseEpilogRemainder) {
  425. #if !defined(NDEBUG)
  426. assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
  427. UseEpilogRemainder) &&
  428. "Should be safe to unroll before checking profitability!");
  429. #endif
  430. // Priority goes to UnrollRuntimeMultiExit if it's supplied.
  431. if (UnrollRuntimeMultiExit.getNumOccurrences())
  432. return UnrollRuntimeMultiExit;
  433. // The main pain point with multi-exit loop unrolling is that once unrolled,
  434. // we will not be able to merge all blocks into a straight line code.
  435. // There are branches within the unrolled loop that go to the OtherExits.
  436. // The second point is the increase in code size, but this is true
  437. // irrespective of multiple exits.
  438. // Note: Both the heuristics below are coarse grained. We are essentially
  439. // enabling unrolling of loops that have a single side exit other than the
  440. // normal LatchExit (i.e. exiting into a deoptimize block).
  441. // The heuristics considered are:
  442. // 1. low number of branches in the unrolled version.
  443. // 2. high predictability of these extra branches.
  444. // We avoid unrolling loops that have more than two exiting blocks. This
  445. // limits the total number of branches in the unrolled loop to be atmost
  446. // the unroll factor (since one of the exiting blocks is the latch block).
  447. SmallVector<BasicBlock*, 4> ExitingBlocks;
  448. L->getExitingBlocks(ExitingBlocks);
  449. if (ExitingBlocks.size() > 2)
  450. return false;
  451. // The second heuristic is that L has one exit other than the latchexit and
  452. // that exit is a deoptimize block. We know that deoptimize blocks are rarely
  453. // taken, which also implies the branch leading to the deoptimize block is
  454. // highly predictable.
  455. return (OtherExits.size() == 1 &&
  456. OtherExits[0]->getTerminatingDeoptimizeCall());
  457. // TODO: These can be fine-tuned further to consider code size or deopt states
  458. // that are captured by the deoptimize exit block.
  459. // Also, we can extend this to support more cases, if we actually
  460. // know of kinds of multiexit loops that would benefit from unrolling.
  461. }
  462. // Assign the maximum possible trip count as the back edge weight for the
  463. // remainder loop if the original loop comes with a branch weight.
  464. static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
  465. Loop *RemainderLoop,
  466. uint64_t UnrollFactor) {
  467. uint64_t TrueWeight, FalseWeight;
  468. BranchInst *LatchBR =
  469. cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
  470. if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
  471. uint64_t ExitWeight = LatchBR->getSuccessor(0) == OrigLoop->getHeader()
  472. ? FalseWeight
  473. : TrueWeight;
  474. assert(UnrollFactor > 1);
  475. uint64_t BackEdgeWeight = (UnrollFactor - 1) * ExitWeight;
  476. BasicBlock *Header = RemainderLoop->getHeader();
  477. BasicBlock *Latch = RemainderLoop->getLoopLatch();
  478. auto *RemainderLatchBR = cast<BranchInst>(Latch->getTerminator());
  479. unsigned HeaderIdx = (RemainderLatchBR->getSuccessor(0) == Header ? 0 : 1);
  480. MDBuilder MDB(RemainderLatchBR->getContext());
  481. MDNode *WeightNode =
  482. HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
  483. : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
  484. RemainderLatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
  485. }
  486. }
  487. /// Insert code in the prolog/epilog code when unrolling a loop with a
  488. /// run-time trip-count.
  489. ///
  490. /// This method assumes that the loop unroll factor is total number
  491. /// of loop bodies in the loop after unrolling. (Some folks refer
  492. /// to the unroll factor as the number of *extra* copies added).
  493. /// We assume also that the loop unroll factor is a power-of-two. So, after
  494. /// unrolling the loop, the number of loop bodies executed is 2,
  495. /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
  496. /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
  497. /// the switch instruction is generated.
  498. ///
  499. /// ***Prolog case***
  500. /// extraiters = tripcount % loopfactor
  501. /// if (extraiters == 0) jump Loop:
  502. /// else jump Prol:
  503. /// Prol: LoopBody;
  504. /// extraiters -= 1 // Omitted if unroll factor is 2.
  505. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
  506. /// if (tripcount < loopfactor) jump End:
  507. /// Loop:
  508. /// ...
  509. /// End:
  510. ///
  511. /// ***Epilog case***
  512. /// extraiters = tripcount % loopfactor
  513. /// if (tripcount < loopfactor) jump LoopExit:
  514. /// unroll_iters = tripcount - extraiters
  515. /// Loop: LoopBody; (executes unroll_iter times);
  516. /// unroll_iter -= 1
  517. /// if (unroll_iter != 0) jump Loop:
  518. /// LoopExit:
  519. /// if (extraiters == 0) jump EpilExit:
  520. /// Epil: LoopBody; (executes extraiters times)
  521. /// extraiters -= 1 // Omitted if unroll factor is 2.
  522. /// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
  523. /// EpilExit:
  524. bool llvm::UnrollRuntimeLoopRemainder(
  525. Loop *L, unsigned Count, bool AllowExpensiveTripCount,
  526. bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
  527. LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
  528. const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
  529. LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
  530. LLVM_DEBUG(L->dump());
  531. LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
  532. : dbgs() << "Using prolog remainder.\n");
  533. // Make sure the loop is in canonical form.
  534. if (!L->isLoopSimplifyForm()) {
  535. LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
  536. return false;
  537. }
  538. // Guaranteed by LoopSimplifyForm.
  539. BasicBlock *Latch = L->getLoopLatch();
  540. BasicBlock *Header = L->getHeader();
  541. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  542. if (!LatchBR || LatchBR->isUnconditional()) {
  543. // The loop-rotate pass can be helpful to avoid this in many cases.
  544. LLVM_DEBUG(
  545. dbgs()
  546. << "Loop latch not terminated by a conditional branch.\n");
  547. return false;
  548. }
  549. unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
  550. BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
  551. if (L->contains(LatchExit)) {
  552. // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
  553. // targets of the Latch be an exit block out of the loop.
  554. LLVM_DEBUG(
  555. dbgs()
  556. << "One of the loop latch successors must be the exit block.\n");
  557. return false;
  558. }
  559. // These are exit blocks other than the target of the latch exiting block.
  560. SmallVector<BasicBlock *, 4> OtherExits;
  561. L->getUniqueNonLatchExitBlocks(OtherExits);
  562. bool isMultiExitUnrollingEnabled =
  563. canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
  564. UseEpilogRemainder) &&
  565. canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
  566. UseEpilogRemainder);
  567. // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
  568. if (!isMultiExitUnrollingEnabled &&
  569. (!L->getExitingBlock() || OtherExits.size())) {
  570. LLVM_DEBUG(
  571. dbgs()
  572. << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
  573. "enabled!\n");
  574. return false;
  575. }
  576. // Use Scalar Evolution to compute the trip count. This allows more loops to
  577. // be unrolled than relying on induction var simplification.
  578. if (!SE)
  579. return false;
  580. // Only unroll loops with a computable trip count, and the trip count needs
  581. // to be an int value (allowing a pointer type is a TODO item).
  582. // We calculate the backedge count by using getExitCount on the Latch block,
  583. // which is proven to be the only exiting block in this loop. This is same as
  584. // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
  585. // exiting blocks).
  586. const SCEV *BECountSC = SE->getExitCount(L, Latch);
  587. if (isa<SCEVCouldNotCompute>(BECountSC) ||
  588. !BECountSC->getType()->isIntegerTy()) {
  589. LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
  590. return false;
  591. }
  592. unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
  593. // Add 1 since the backedge count doesn't include the first loop iteration.
  594. const SCEV *TripCountSC =
  595. SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
  596. if (isa<SCEVCouldNotCompute>(TripCountSC)) {
  597. LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
  598. return false;
  599. }
  600. BasicBlock *PreHeader = L->getLoopPreheader();
  601. BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  602. const DataLayout &DL = Header->getModule()->getDataLayout();
  603. SCEVExpander Expander(*SE, DL, "loop-unroll");
  604. if (!AllowExpensiveTripCount &&
  605. Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
  606. TTI, PreHeaderBR)) {
  607. LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
  608. return false;
  609. }
  610. // This constraint lets us deal with an overflowing trip count easily; see the
  611. // comment on ModVal below.
  612. if (Log2_32(Count) > BEWidth) {
  613. LLVM_DEBUG(
  614. dbgs()
  615. << "Count failed constraint on overflow trip count calculation.\n");
  616. return false;
  617. }
  618. // Loop structure is the following:
  619. //
  620. // PreHeader
  621. // Header
  622. // ...
  623. // Latch
  624. // LatchExit
  625. BasicBlock *NewPreHeader;
  626. BasicBlock *NewExit = nullptr;
  627. BasicBlock *PrologExit = nullptr;
  628. BasicBlock *EpilogPreHeader = nullptr;
  629. BasicBlock *PrologPreHeader = nullptr;
  630. if (UseEpilogRemainder) {
  631. // If epilog remainder
  632. // Split PreHeader to insert a branch around loop for unrolling.
  633. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
  634. NewPreHeader->setName(PreHeader->getName() + ".new");
  635. // Split LatchExit to create phi nodes from branch above.
  636. SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
  637. NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
  638. nullptr, PreserveLCSSA);
  639. // NewExit gets its DebugLoc from LatchExit, which is not part of the
  640. // original Loop.
  641. // Fix this by setting Loop's DebugLoc to NewExit.
  642. auto *NewExitTerminator = NewExit->getTerminator();
  643. NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
  644. // Split NewExit to insert epilog remainder loop.
  645. EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
  646. EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
  647. } else {
  648. // If prolog remainder
  649. // Split the original preheader twice to insert prolog remainder loop
  650. PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
  651. PrologPreHeader->setName(Header->getName() + ".prol.preheader");
  652. PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
  653. DT, LI);
  654. PrologExit->setName(Header->getName() + ".prol.loopexit");
  655. // Split PrologExit to get NewPreHeader.
  656. NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
  657. NewPreHeader->setName(PreHeader->getName() + ".new");
  658. }
  659. // Loop structure should be the following:
  660. // Epilog Prolog
  661. //
  662. // PreHeader PreHeader
  663. // *NewPreHeader *PrologPreHeader
  664. // Header *PrologExit
  665. // ... *NewPreHeader
  666. // Latch Header
  667. // *NewExit ...
  668. // *EpilogPreHeader Latch
  669. // LatchExit LatchExit
  670. // Calculate conditions for branch around loop for unrolling
  671. // in epilog case and around prolog remainder loop in prolog case.
  672. // Compute the number of extra iterations required, which is:
  673. // extra iterations = run-time trip count % loop unroll factor
  674. PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  675. Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
  676. PreHeaderBR);
  677. Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
  678. PreHeaderBR);
  679. IRBuilder<> B(PreHeaderBR);
  680. Value *ModVal;
  681. // Calculate ModVal = (BECount + 1) % Count.
  682. // Note that TripCount is BECount + 1.
  683. if (isPowerOf2_32(Count)) {
  684. // When Count is power of 2 we don't BECount for epilog case, however we'll
  685. // need it for a branch around unrolling loop for prolog case.
  686. ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
  687. // 1. There are no iterations to be run in the prolog/epilog loop.
  688. // OR
  689. // 2. The addition computing TripCount overflowed.
  690. //
  691. // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
  692. // the number of iterations that remain to be run in the original loop is a
  693. // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
  694. // explicitly check this above).
  695. } else {
  696. // As (BECount + 1) can potentially unsigned overflow we count
  697. // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
  698. Value *ModValTmp = B.CreateURem(BECount,
  699. ConstantInt::get(BECount->getType(),
  700. Count));
  701. Value *ModValAdd = B.CreateAdd(ModValTmp,
  702. ConstantInt::get(ModValTmp->getType(), 1));
  703. // At that point (BECount % Count) + 1 could be equal to Count.
  704. // To handle this case we need to take mod by Count one more time.
  705. ModVal = B.CreateURem(ModValAdd,
  706. ConstantInt::get(BECount->getType(), Count),
  707. "xtraiter");
  708. }
  709. Value *BranchVal =
  710. UseEpilogRemainder ? B.CreateICmpULT(BECount,
  711. ConstantInt::get(BECount->getType(),
  712. Count - 1)) :
  713. B.CreateIsNotNull(ModVal, "lcmp.mod");
  714. BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
  715. BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
  716. // Branch to either remainder (extra iterations) loop or unrolling loop.
  717. B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
  718. PreHeaderBR->eraseFromParent();
  719. if (DT) {
  720. if (UseEpilogRemainder)
  721. DT->changeImmediateDominator(NewExit, PreHeader);
  722. else
  723. DT->changeImmediateDominator(PrologExit, PreHeader);
  724. }
  725. Function *F = Header->getParent();
  726. // Get an ordered list of blocks in the loop to help with the ordering of the
  727. // cloned blocks in the prolog/epilog code
  728. LoopBlocksDFS LoopBlocks(L);
  729. LoopBlocks.perform(LI);
  730. //
  731. // For each extra loop iteration, create a copy of the loop's basic blocks
  732. // and generate a condition that branches to the copy depending on the
  733. // number of 'left over' iterations.
  734. //
  735. std::vector<BasicBlock *> NewBlocks;
  736. ValueToValueMapTy VMap;
  737. // For unroll factor 2 remainder loop will have 1 iterations.
  738. // Do not create 1 iteration loop.
  739. bool CreateRemainderLoop = (Count != 2);
  740. // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  741. // the loop, otherwise we create a cloned loop to execute the extra
  742. // iterations. This function adds the appropriate CFG connections.
  743. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
  744. BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
  745. Loop *remainderLoop = CloneLoopBlocks(
  746. L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
  747. InsertTop, InsertBot,
  748. NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
  749. // Assign the maximum possible trip count as the back edge weight for the
  750. // remainder loop if the original loop comes with a branch weight.
  751. if (remainderLoop && !UnrollRemainder)
  752. updateLatchBranchWeightsForRemainderLoop(L, remainderLoop, Count);
  753. // Insert the cloned blocks into the function.
  754. F->getBasicBlockList().splice(InsertBot->getIterator(),
  755. F->getBasicBlockList(),
  756. NewBlocks[0]->getIterator(),
  757. F->end());
  758. // Now the loop blocks are cloned and the other exiting blocks from the
  759. // remainder are connected to the original Loop's exit blocks. The remaining
  760. // work is to update the phi nodes in the original loop, and take in the
  761. // values from the cloned region.
  762. for (auto *BB : OtherExits) {
  763. for (auto &II : *BB) {
  764. // Given we preserve LCSSA form, we know that the values used outside the
  765. // loop will be used through these phi nodes at the exit blocks that are
  766. // transformed below.
  767. if (!isa<PHINode>(II))
  768. break;
  769. PHINode *Phi = cast<PHINode>(&II);
  770. unsigned oldNumOperands = Phi->getNumIncomingValues();
  771. // Add the incoming values from the remainder code to the end of the phi
  772. // node.
  773. for (unsigned i =0; i < oldNumOperands; i++){
  774. Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
  775. // newVal can be a constant or derived from values outside the loop, and
  776. // hence need not have a VMap value. Also, since lookup already generated
  777. // a default "null" VMap entry for this value, we need to populate that
  778. // VMap entry correctly, with the mapped entry being itself.
  779. if (!newVal) {
  780. newVal = Phi->getIncomingValue(i);
  781. VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
  782. }
  783. Phi->addIncoming(newVal,
  784. cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
  785. }
  786. }
  787. #if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
  788. for (BasicBlock *SuccBB : successors(BB)) {
  789. assert(!(any_of(OtherExits,
  790. [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
  791. SuccBB == LatchExit) &&
  792. "Breaks the definition of dedicated exits!");
  793. }
  794. #endif
  795. }
  796. // Update the immediate dominator of the exit blocks and blocks that are
  797. // reachable from the exit blocks. This is needed because we now have paths
  798. // from both the original loop and the remainder code reaching the exit
  799. // blocks. While the IDom of these exit blocks were from the original loop,
  800. // now the IDom is the preheader (which decides whether the original loop or
  801. // remainder code should run).
  802. if (DT && !L->getExitingBlock()) {
  803. SmallVector<BasicBlock *, 16> ChildrenToUpdate;
  804. // NB! We have to examine the dom children of all loop blocks, not just
  805. // those which are the IDom of the exit blocks. This is because blocks
  806. // reachable from the exit blocks can have their IDom as the nearest common
  807. // dominator of the exit blocks.
  808. for (auto *BB : L->blocks()) {
  809. auto *DomNodeBB = DT->getNode(BB);
  810. for (auto *DomChild : DomNodeBB->children()) {
  811. auto *DomChildBB = DomChild->getBlock();
  812. if (!L->contains(LI->getLoopFor(DomChildBB)))
  813. ChildrenToUpdate.push_back(DomChildBB);
  814. }
  815. }
  816. for (auto *BB : ChildrenToUpdate)
  817. DT->changeImmediateDominator(BB, PreHeader);
  818. }
  819. // Loop structure should be the following:
  820. // Epilog Prolog
  821. //
  822. // PreHeader PreHeader
  823. // NewPreHeader PrologPreHeader
  824. // Header PrologHeader
  825. // ... ...
  826. // Latch PrologLatch
  827. // NewExit PrologExit
  828. // EpilogPreHeader NewPreHeader
  829. // EpilogHeader Header
  830. // ... ...
  831. // EpilogLatch Latch
  832. // LatchExit LatchExit
  833. // Rewrite the cloned instruction operands to use the values created when the
  834. // clone is created.
  835. for (BasicBlock *BB : NewBlocks) {
  836. for (Instruction &I : *BB) {
  837. RemapInstruction(&I, VMap,
  838. RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
  839. }
  840. }
  841. if (UseEpilogRemainder) {
  842. // Connect the epilog code to the original loop and update the
  843. // PHI functions.
  844. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
  845. EpilogPreHeader, NewPreHeader, VMap, DT, LI,
  846. PreserveLCSSA);
  847. // Update counter in loop for unrolling.
  848. // I should be multiply of Count.
  849. IRBuilder<> B2(NewPreHeader->getTerminator());
  850. Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
  851. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  852. B2.SetInsertPoint(LatchBR);
  853. PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
  854. Header->getFirstNonPHI());
  855. Value *IdxSub =
  856. B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  857. NewIdx->getName() + ".nsub");
  858. Value *IdxCmp;
  859. if (LatchBR->getSuccessor(0) == Header)
  860. IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
  861. else
  862. IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
  863. NewIdx->addIncoming(TestVal, NewPreHeader);
  864. NewIdx->addIncoming(IdxSub, Latch);
  865. LatchBR->setCondition(IdxCmp);
  866. } else {
  867. // Connect the prolog code to the original loop and update the
  868. // PHI functions.
  869. ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
  870. NewPreHeader, VMap, DT, LI, PreserveLCSSA);
  871. }
  872. // If this loop is nested, then the loop unroller changes the code in the any
  873. // of its parent loops, so the Scalar Evolution pass needs to be run again.
  874. SE->forgetTopmostLoop(L);
  875. // Verify that the Dom Tree is correct.
  876. #if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
  877. if (DT)
  878. assert(DT->verify(DominatorTree::VerificationLevel::Full));
  879. #endif
  880. // Canonicalize to LoopSimplifyForm both original and remainder loops. We
  881. // cannot rely on the LoopUnrollPass to do this because it only does
  882. // canonicalization for parent/subloops and not the sibling loops.
  883. if (OtherExits.size() > 0) {
  884. // Generate dedicated exit blocks for the original loop, to preserve
  885. // LoopSimplifyForm.
  886. formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
  887. // Generate dedicated exit blocks for the remainder loop if one exists, to
  888. // preserve LoopSimplifyForm.
  889. if (remainderLoop)
  890. formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
  891. }
  892. auto UnrollResult = LoopUnrollResult::Unmodified;
  893. if (remainderLoop && UnrollRemainder) {
  894. LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
  895. UnrollResult =
  896. UnrollLoop(remainderLoop,
  897. {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
  898. /*Force*/ false, /*AllowRuntime*/ false,
  899. /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
  900. /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
  901. /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
  902. LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
  903. }
  904. if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
  905. *ResultLoop = remainderLoop;
  906. NumRuntimeUnrolled++;
  907. return true;
  908. }