  1. //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements some loop unrolling utilities. It does not define any
  10. // actual pass or policy, but provides a single function to perform loop
  11. // unrolling.
  12. //
  13. // The process of unrolling can produce extraneous basic blocks linked with
  14. // unconditional branches. This will be corrected in the future.
  15. //
  16. //===----------------------------------------------------------------------===//
  17. #include "llvm/ADT/ArrayRef.h"
  18. #include "llvm/ADT/DenseMap.h"
  19. #include "llvm/ADT/Optional.h"
  20. #include "llvm/ADT/STLExtras.h"
  21. #include "llvm/ADT/SetVector.h"
  22. #include "llvm/ADT/SmallVector.h"
  23. #include "llvm/ADT/Statistic.h"
  24. #include "llvm/ADT/StringRef.h"
  25. #include "llvm/ADT/Twine.h"
  26. #include "llvm/ADT/ilist_iterator.h"
  27. #include "llvm/ADT/iterator_range.h"
  28. #include "llvm/Analysis/AssumptionCache.h"
  29. #include "llvm/Analysis/DomTreeUpdater.h"
  30. #include "llvm/Analysis/InstructionSimplify.h"
  31. #include "llvm/Analysis/LoopInfo.h"
  32. #include "llvm/Analysis/LoopIterator.h"
  33. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  34. #include "llvm/Analysis/ScalarEvolution.h"
  35. #include "llvm/IR/BasicBlock.h"
  36. #include "llvm/IR/CFG.h"
  37. #include "llvm/IR/Constants.h"
  38. #include "llvm/IR/DebugInfoMetadata.h"
  39. #include "llvm/IR/DebugLoc.h"
  40. #include "llvm/IR/DiagnosticInfo.h"
  41. #include "llvm/IR/Dominators.h"
  42. #include "llvm/IR/Function.h"
  43. #include "llvm/IR/Instruction.h"
  44. #include "llvm/IR/Instructions.h"
  45. #include "llvm/IR/IntrinsicInst.h"
  46. #include "llvm/IR/Metadata.h"
  47. #include "llvm/IR/Module.h"
  48. #include "llvm/IR/Use.h"
  49. #include "llvm/IR/User.h"
  50. #include "llvm/IR/ValueHandle.h"
  51. #include "llvm/IR/ValueMap.h"
  52. #include "llvm/Support/Casting.h"
  53. #include "llvm/Support/CommandLine.h"
  54. #include "llvm/Support/Debug.h"
  55. #include "llvm/Support/GenericDomTree.h"
  56. #include "llvm/Support/MathExtras.h"
  57. #include "llvm/Support/raw_ostream.h"
  58. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  59. #include "llvm/Transforms/Utils/Cloning.h"
  60. #include "llvm/Transforms/Utils/Local.h"
  61. #include "llvm/Transforms/Utils/LoopPeel.h"
  62. #include "llvm/Transforms/Utils/LoopSimplify.h"
  63. #include "llvm/Transforms/Utils/LoopUtils.h"
  64. #include "llvm/Transforms/Utils/SimplifyIndVar.h"
  65. #include "llvm/Transforms/Utils/UnrollLoop.h"
  66. #include "llvm/Transforms/Utils/ValueMapper.h"
  67. #include <algorithm>
  68. #include <assert.h>
  69. #include <type_traits>
  70. #include <vector>
namespace llvm {
// Forward declarations to avoid pulling in heavier headers here.
class DataLayout;
class Value;
} // namespace llvm

using namespace llvm;

#define DEBUG_TYPE "loop-unroll"

// TODO: Should these be here or in LoopUnroll?
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
"latch (completely or otherwise)");

// When set, runtime unrolling emits the remainder loop as an epilog after the
// unrolled body instead of a prolog before it; overrides the profitability
// heuristic in isEpilogProfitable() whenever the flag occurs on the command
// line.
static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));

// Verify the dominator tree after unrolling; defaults to on only for
// EXPENSIVE_CHECKS builds since verification is costly.
static cl::opt<bool>
UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
cl::desc("Verify domtree after unrolling"),
#ifdef EXPENSIVE_CHECKS
cl::init(true)
#else
cl::init(false)
#endif
);
  95. /// Check if unrolling created a situation where we need to insert phi nodes to
  96. /// preserve LCSSA form.
  97. /// \param Blocks is a vector of basic blocks representing unrolled loop.
  98. /// \param L is the outer loop.
  99. /// It's possible that some of the blocks are in L, and some are not. In this
  100. /// case, if there is a use is outside L, and definition is inside L, we need to
  101. /// insert a phi-node, otherwise LCSSA will be broken.
  102. /// The function is just a helper function for llvm::UnrollLoop that returns
  103. /// true if this situation occurs, indicating that LCSSA needs to be fixed.
  104. static bool needToInsertPhisForLCSSA(Loop *L,
  105. const std::vector<BasicBlock *> &Blocks,
  106. LoopInfo *LI) {
  107. for (BasicBlock *BB : Blocks) {
  108. if (LI->getLoopFor(BB) == L)
  109. continue;
  110. for (Instruction &I : *BB) {
  111. for (Use &U : I.operands()) {
  112. if (const auto *Def = dyn_cast<Instruction>(U)) {
  113. Loop *DefLoop = LI->getLoopFor(Def->getParent());
  114. if (!DefLoop)
  115. continue;
  116. if (DefLoop->contains(L))
  117. return true;
  118. }
  119. }
  120. }
  121. }
  122. return false;
  123. }
  124. /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
  125. /// and adds a mapping from the original loop to the new loop to NewLoops.
  126. /// Returns nullptr if no new loop was created and a pointer to the
  127. /// original loop OriginalBB was part of otherwise.
  128. const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
  129. BasicBlock *ClonedBB, LoopInfo *LI,
  130. NewLoopsMap &NewLoops) {
  131. // Figure out which loop New is in.
  132. const Loop *OldLoop = LI->getLoopFor(OriginalBB);
  133. assert(OldLoop && "Should (at least) be in the loop being unrolled!");
  134. Loop *&NewLoop = NewLoops[OldLoop];
  135. if (!NewLoop) {
  136. // Found a new sub-loop.
  137. assert(OriginalBB == OldLoop->getHeader() &&
  138. "Header should be first in RPO");
  139. NewLoop = LI->AllocateLoop();
  140. Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
  141. if (NewLoopParent)
  142. NewLoopParent->addChildLoop(NewLoop);
  143. else
  144. LI->addTopLevelLoop(NewLoop);
  145. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  146. return OldLoop;
  147. } else {
  148. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  149. return nullptr;
  150. }
  151. }
  152. /// The function chooses which type of unroll (epilog or prolog) is more
  153. /// profitabale.
  154. /// Epilog unroll is more profitable when there is PHI that starts from
  155. /// constant. In this case epilog will leave PHI start from constant,
  156. /// but prolog will convert it to non-constant.
  157. ///
  158. /// loop:
  159. /// PN = PHI [I, Latch], [CI, PreHeader]
  160. /// I = foo(PN)
  161. /// ...
  162. ///
  163. /// Epilog unroll case.
  164. /// loop:
  165. /// PN = PHI [I2, Latch], [CI, PreHeader]
  166. /// I1 = foo(PN)
  167. /// I2 = foo(I1)
  168. /// ...
  169. /// Prolog unroll case.
  170. /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
  171. /// loop:
  172. /// PN = PHI [I2, Latch], [NewPN, PreHeader]
  173. /// I1 = foo(PN)
  174. /// I2 = foo(I1)
  175. /// ...
  176. ///
  177. static bool isEpilogProfitable(Loop *L) {
  178. BasicBlock *PreHeader = L->getLoopPreheader();
  179. BasicBlock *Header = L->getHeader();
  180. assert(PreHeader && Header);
  181. for (const PHINode &PN : Header->phis()) {
  182. if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
  183. return true;
  184. }
  185. return false;
  186. }
  187. /// Perform some cleanup and simplifications on loops after unrolling. It is
  188. /// useful to simplify the IV's in the new loop, as well as do a quick
  189. /// simplify/dce pass of the instructions.
  190. void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
  191. ScalarEvolution *SE, DominatorTree *DT,
  192. AssumptionCache *AC,
  193. const TargetTransformInfo *TTI) {
  194. // Simplify any new induction variables in the partially unrolled loop.
  195. if (SE && SimplifyIVs) {
  196. SmallVector<WeakTrackingVH, 16> DeadInsts;
  197. simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
  198. // Aggressively clean up dead instructions that simplifyLoopIVs already
  199. // identified. Any remaining should be cleaned up below.
  200. while (!DeadInsts.empty()) {
  201. Value *V = DeadInsts.pop_back_val();
  202. if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
  203. RecursivelyDeleteTriviallyDeadInstructions(Inst);
  204. }
  205. }
  206. // At this point, the code is well formed. We now do a quick sweep over the
  207. // inserted code, doing constant propagation and dead code elimination as we
  208. // go.
  209. const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
  210. for (BasicBlock *BB : L->getBlocks()) {
  211. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
  212. Instruction *Inst = &*I++;
  213. if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
  214. if (LI->replacementPreservesLCSSAForm(Inst, V))
  215. Inst->replaceAllUsesWith(V);
  216. if (isInstructionTriviallyDead(Inst))
  217. BB->getInstList().erase(Inst);
  218. }
  219. }
  220. // TODO: after peeling or unrolling, previously loop variant conditions are
  221. // likely to fold to constants, eagerly propagating those here will require
  222. // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
  223. // appropriate.
  224. }
  225. /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
  226. /// can only fail when the loop's latch block is not terminated by a conditional
  227. /// branch instruction. However, if the trip count (and multiple) are not known,
  228. /// loop unrolling will mostly produce more code that is no faster.
  229. ///
  230. /// TripCount is the upper bound of the iteration on which control exits
  231. /// LatchBlock. Control may exit the loop prior to TripCount iterations either
  232. /// via an early branch in other loop block or via LatchBlock terminator. This
  233. /// is relaxed from the general definition of trip count which is the number of
  234. /// times the loop header executes. Note that UnrollLoop assumes that the loop
  235. /// counter test is in LatchBlock in order to remove unnecesssary instances of
  236. /// the test. If control can exit the loop from the LatchBlock's terminator
  237. /// prior to TripCount iterations, flag PreserveCondBr needs to be set.
  238. ///
  239. /// PreserveCondBr indicates whether the conditional branch of the LatchBlock
  240. /// needs to be preserved. It is needed when we use trip count upper bound to
  241. /// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
  242. /// conditional branch needs to be preserved.
  243. ///
  244. /// Similarly, TripMultiple divides the number of times that the LatchBlock may
  245. /// execute without exiting the loop.
  246. ///
  247. /// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
  248. /// have a runtime (i.e. not compile time constant) trip count. Unrolling these
  249. /// loops require a unroll "prologue" that runs "RuntimeTripCount % Count"
  250. /// iterations before branching into the unrolled loop. UnrollLoop will not
  251. /// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
  252. /// AllowExpensiveTripCount is false.
  253. ///
  254. /// If we want to perform PGO-based loop peeling, PeelCount is set to the
  255. /// number of iterations we want to peel off.
  256. ///
  257. /// The LoopInfo Analysis that is passed will be kept consistent.
  258. ///
  259. /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
  260. /// DominatorTree if they are non-null.
  261. ///
  262. /// If RemainderLoop is non-null, it will receive the remainder loop (if
  263. /// required and not fully unrolled).
  264. LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
  265. ScalarEvolution *SE, DominatorTree *DT,
  266. AssumptionCache *AC,
  267. const TargetTransformInfo *TTI,
  268. OptimizationRemarkEmitter *ORE,
  269. bool PreserveLCSSA, Loop **RemainderLoop) {
  270. if (!L->getLoopPreheader()) {
  271. LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
  272. return LoopUnrollResult::Unmodified;
  273. }
  274. if (!L->getLoopLatch()) {
  275. LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
  276. return LoopUnrollResult::Unmodified;
  277. }
  278. // Loops with indirectbr cannot be cloned.
  279. if (!L->isSafeToClone()) {
  280. LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
  281. return LoopUnrollResult::Unmodified;
  282. }
  283. if (L->getHeader()->hasAddressTaken()) {
  284. // The loop-rotate pass can be helpful to avoid this in many cases.
  285. LLVM_DEBUG(
  286. dbgs() << " Won't unroll loop: address of header block is taken.\n");
  287. return LoopUnrollResult::Unmodified;
  288. }
  289. if (ULO.TripCount != 0)
  290. LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
  291. if (ULO.TripMultiple != 1)
  292. LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
  293. // Effectively "DCE" unrolled iterations that are beyond the tripcount
  294. // and will never be executed.
  295. if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
  296. ULO.Count = ULO.TripCount;
  297. // Don't enter the unroll code if there is nothing to do.
  298. if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
  299. LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
  300. return LoopUnrollResult::Unmodified;
  301. }
  302. assert(ULO.Count > 0);
  303. assert(ULO.TripMultiple > 0);
  304. assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
  305. // Are we eliminating the loop control altogether?
  306. bool CompletelyUnroll = ULO.Count == ULO.TripCount;
  307. // We assume a run-time trip count if the compiler cannot
  308. // figure out the loop trip count and the unroll-runtime
  309. // flag is specified.
  310. bool RuntimeTripCount =
  311. (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
  312. assert((!RuntimeTripCount || !ULO.PeelCount) &&
  313. "Did not expect runtime trip-count unrolling "
  314. "and peeling for the same loop");
  315. bool Peeled = false;
  316. if (ULO.PeelCount) {
  317. Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
  318. // Successful peeling may result in a change in the loop preheader/trip
  319. // counts. If we later unroll the loop, we want these to be updated.
  320. if (Peeled) {
  321. // According to our guards and profitability checks the only
  322. // meaningful exit should be latch block. Other exits go to deopt,
  323. // so we do not worry about them.
  324. BasicBlock *ExitingBlock = L->getLoopLatch();
  325. assert(ExitingBlock && "Loop without exiting block?");
  326. assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
  327. ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
  328. ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
  329. }
  330. }
  331. // All these values should be taken only after peeling because they might have
  332. // changed.
  333. BasicBlock *Preheader = L->getLoopPreheader();
  334. BasicBlock *Header = L->getHeader();
  335. BasicBlock *LatchBlock = L->getLoopLatch();
  336. SmallVector<BasicBlock *, 4> ExitBlocks;
  337. L->getExitBlocks(ExitBlocks);
  338. std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();
  339. // Go through all exits of L and see if there are any phi-nodes there. We just
  340. // conservatively assume that they're inserted to preserve LCSSA form, which
  341. // means that complete unrolling might break this form. We need to either fix
  342. // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
  343. // now we just recompute LCSSA for the outer loop, but it should be possible
  344. // to fix it in-place.
  345. bool NeedToFixLCSSA =
  346. PreserveLCSSA && CompletelyUnroll &&
  347. any_of(ExitBlocks,
  348. [](const BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
  349. // The current loop unroll pass can unroll loops that have
  350. // (1) single latch; and
  351. // (2a) latch is unconditional; or
  352. // (2b) latch is conditional and is an exiting block
  353. // FIXME: The implementation can be extended to work with more complicated
  354. // cases, e.g. loops with multiple latches.
  355. BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  356. // A conditional branch which exits the loop, which can be optimized to an
  357. // unconditional branch in the unrolled loop in some cases.
  358. BranchInst *ExitingBI = nullptr;
  359. bool LatchIsExiting = L->isLoopExiting(LatchBlock);
  360. if (LatchIsExiting)
  361. ExitingBI = LatchBI;
  362. else if (BasicBlock *ExitingBlock = L->getExitingBlock())
  363. ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
  364. if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) {
  365. // If the peeling guard is changed this assert may be relaxed or even
  366. // deleted.
  367. assert(!Peeled && "Peeling guard changed!");
  368. LLVM_DEBUG(
  369. dbgs() << "Can't unroll; a conditional latch must exit the loop");
  370. return LoopUnrollResult::Unmodified;
  371. }
  372. LLVM_DEBUG({
  373. if (ExitingBI)
  374. dbgs() << " Exiting Block = " << ExitingBI->getParent()->getName()
  375. << "\n";
  376. else
  377. dbgs() << " No single exiting block\n";
  378. });
  379. // Loops containing convergent instructions must have a count that divides
  380. // their TripMultiple.
  381. LLVM_DEBUG(
  382. {
  383. bool HasConvergent = false;
  384. for (auto &BB : L->blocks())
  385. for (auto &I : *BB)
  386. if (auto *CB = dyn_cast<CallBase>(&I))
  387. HasConvergent |= CB->isConvergent();
  388. assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
  389. "Unroll count must divide trip multiple if loop contains a "
  390. "convergent operation.");
  391. });
  392. bool EpilogProfitability =
  393. UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
  394. : isEpilogProfitable(L);
  395. if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
  396. !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
  397. EpilogProfitability, ULO.UnrollRemainder,
  398. ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
  399. PreserveLCSSA, RemainderLoop)) {
  400. if (ULO.Force)
  401. RuntimeTripCount = false;
  402. else {
  403. LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
  404. "generated when assuming runtime trip count\n");
  405. return LoopUnrollResult::Unmodified;
  406. }
  407. }
  408. // If we know the trip count, we know the multiple...
  409. unsigned BreakoutTrip = 0;
  410. if (ULO.TripCount != 0) {
  411. BreakoutTrip = ULO.TripCount % ULO.Count;
  412. ULO.TripMultiple = 0;
  413. } else {
  414. // Figure out what multiple to use.
  415. BreakoutTrip = ULO.TripMultiple =
  416. (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
  417. }
  418. using namespace ore;
  419. // Report the unrolling decision.
  420. if (CompletelyUnroll) {
  421. LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
  422. << " with trip count " << ULO.TripCount << "!\n");
  423. if (ORE)
  424. ORE->emit([&]() {
  425. return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
  426. L->getHeader())
  427. << "completely unrolled loop with "
  428. << NV("UnrollCount", ULO.TripCount) << " iterations";
  429. });
  430. } else if (ULO.PeelCount) {
  431. LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
  432. << " with iteration count " << ULO.PeelCount << "!\n");
  433. if (ORE)
  434. ORE->emit([&]() {
  435. return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
  436. L->getHeader())
  437. << " peeled loop by " << NV("PeelCount", ULO.PeelCount)
  438. << " iterations";
  439. });
  440. } else {
  441. auto DiagBuilder = [&]() {
  442. OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
  443. L->getHeader());
  444. return Diag << "unrolled loop by a factor of "
  445. << NV("UnrollCount", ULO.Count);
  446. };
  447. LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
  448. << ULO.Count);
  449. if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
  450. LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
  451. if (ORE)
  452. ORE->emit([&]() {
  453. return DiagBuilder() << " with a breakout at trip "
  454. << NV("BreakoutTrip", BreakoutTrip);
  455. });
  456. } else if (ULO.TripMultiple != 1) {
  457. LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
  458. if (ORE)
  459. ORE->emit([&]() {
  460. return DiagBuilder()
  461. << " with " << NV("TripMultiple", ULO.TripMultiple)
  462. << " trips per branch";
  463. });
  464. } else if (RuntimeTripCount) {
  465. LLVM_DEBUG(dbgs() << " with run-time trip count");
  466. if (ORE)
  467. ORE->emit(
  468. [&]() { return DiagBuilder() << " with run-time trip count"; });
  469. }
  470. LLVM_DEBUG(dbgs() << "!\n");
  471. }
  472. // We are going to make changes to this loop. SCEV may be keeping cached info
  473. // about it, in particular about backedge taken count. The changes we make
  474. // are guaranteed to invalidate this information for our loop. It is tempting
  475. // to only invalidate the loop being unrolled, but it is incorrect as long as
  476. // all exiting branches from all inner loops have impact on the outer loops,
  477. // and if something changes inside them then any of outer loops may also
  478. // change. When we forget outermost loop, we also forget all contained loops
  479. // and this is what we need here.
  480. if (SE) {
  481. if (ULO.ForgetAllSCEV)
  482. SE->forgetAllLoops();
  483. else
  484. SE->forgetTopmostLoop(L);
  485. }
  486. if (!LatchIsExiting)
  487. ++NumUnrolledNotLatch;
  488. Optional<bool> ContinueOnTrue = None;
  489. BasicBlock *LoopExit = nullptr;
  490. if (ExitingBI) {
  491. ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
  492. LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
  493. }
  494. // For the first iteration of the loop, we should use the precloned values for
  495. // PHI nodes. Insert associations now.
  496. ValueToValueMapTy LastValueMap;
  497. std::vector<PHINode*> OrigPHINode;
  498. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  499. OrigPHINode.push_back(cast<PHINode>(I));
  500. }
  501. std::vector<BasicBlock *> Headers;
  502. std::vector<BasicBlock *> ExitingBlocks;
  503. std::vector<BasicBlock *> ExitingSucc;
  504. std::vector<BasicBlock *> Latches;
  505. Headers.push_back(Header);
  506. Latches.push_back(LatchBlock);
  507. if (ExitingBI) {
  508. ExitingBlocks.push_back(ExitingBI->getParent());
  509. ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
  510. }
  511. // The current on-the-fly SSA update requires blocks to be processed in
  512. // reverse postorder so that LastValueMap contains the correct value at each
  513. // exit.
  514. LoopBlocksDFS DFS(L);
  515. DFS.perform(LI);
  516. // Stash the DFS iterators before adding blocks to the loop.
  517. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  518. LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
  519. std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
  520. // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
  521. // might break loop-simplified form for these loops (as they, e.g., would
  522. // share the same exit blocks). We'll keep track of loops for which we can
  523. // break this so that later we can re-simplify them.
  524. SmallSetVector<Loop *, 4> LoopsToSimplify;
  525. for (Loop *SubLoop : *L)
  526. LoopsToSimplify.insert(SubLoop);
  527. if (Header->getParent()->isDebugInfoForProfiling())
  528. for (BasicBlock *BB : L->getBlocks())
  529. for (Instruction &I : *BB)
  530. if (!isa<DbgInfoIntrinsic>(&I))
  531. if (const DILocation *DIL = I.getDebugLoc()) {
  532. auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
  533. if (NewDIL)
  534. I.setDebugLoc(NewDIL.getValue());
  535. else
  536. LLVM_DEBUG(dbgs()
  537. << "Failed to create new discriminator: "
  538. << DIL->getFilename() << " Line: " << DIL->getLine());
  539. }
  540. // Identify what noalias metadata is inside the loop: if it is inside the
  541. // loop, the associated metadata must be cloned for each iteration.
  542. SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
  543. identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
  544. for (unsigned It = 1; It != ULO.Count; ++It) {
  545. SmallVector<BasicBlock *, 8> NewBlocks;
  546. SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
  547. NewLoops[L] = L;
  548. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  549. ValueToValueMapTy VMap;
  550. BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
  551. Header->getParent()->getBasicBlockList().push_back(New);
  552. assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
  553. "Header should not be in a sub-loop");
  554. // Tell LI about New.
  555. const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
  556. if (OldLoop)
  557. LoopsToSimplify.insert(NewLoops[OldLoop]);
  558. if (*BB == Header)
  559. // Loop over all of the PHI nodes in the block, changing them to use
  560. // the incoming values from the previous block.
  561. for (PHINode *OrigPHI : OrigPHINode) {
  562. PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
  563. Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
  564. if (Instruction *InValI = dyn_cast<Instruction>(InVal))
  565. if (It > 1 && L->contains(InValI))
  566. InVal = LastValueMap[InValI];
  567. VMap[OrigPHI] = InVal;
  568. New->getInstList().erase(NewPHI);
  569. }
  570. // Update our running map of newest clones
  571. LastValueMap[*BB] = New;
  572. for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
  573. VI != VE; ++VI)
  574. LastValueMap[VI->first] = VI->second;
  575. // Add phi entries for newly created values to all exit blocks.
  576. for (BasicBlock *Succ : successors(*BB)) {
  577. if (L->contains(Succ))
  578. continue;
  579. for (PHINode &PHI : Succ->phis()) {
  580. Value *Incoming = PHI.getIncomingValueForBlock(*BB);
  581. ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
  582. if (It != LastValueMap.end())
  583. Incoming = It->second;
  584. PHI.addIncoming(Incoming, New);
  585. }
  586. }
  587. // Keep track of new headers and latches as we create them, so that
  588. // we can insert the proper branches later.
  589. if (*BB == Header)
  590. Headers.push_back(New);
  591. if (*BB == LatchBlock)
  592. Latches.push_back(New);
  593. // Keep track of the exiting block and its successor block contained in
  594. // the loop for the current iteration.
  595. if (ExitingBI) {
  596. if (*BB == ExitingBlocks[0])
  597. ExitingBlocks.push_back(New);
  598. if (*BB == ExitingSucc[0])
  599. ExitingSucc.push_back(New);
  600. }
  601. NewBlocks.push_back(New);
  602. UnrolledLoopBlocks.push_back(New);
  603. // Update DomTree: since we just copy the loop body, and each copy has a
  604. // dedicated entry block (copy of the header block), this header's copy
  605. // dominates all copied blocks. That means, dominance relations in the
  606. // copied body are the same as in the original body.
  607. if (DT) {
  608. if (*BB == Header)
  609. DT->addNewBlock(New, Latches[It - 1]);
  610. else {
  611. auto BBDomNode = DT->getNode(*BB);
  612. auto BBIDom = BBDomNode->getIDom();
  613. BasicBlock *OriginalBBIDom = BBIDom->getBlock();
  614. DT->addNewBlock(
  615. New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
  616. }
  617. }
  618. }
  619. // Remap all instructions in the most recent iteration
  620. remapInstructionsInBlocks(NewBlocks, LastValueMap);
  621. for (BasicBlock *NewBlock : NewBlocks) {
  622. for (Instruction &I : *NewBlock) {
  623. if (auto *II = dyn_cast<IntrinsicInst>(&I))
  624. if (II->getIntrinsicID() == Intrinsic::assume)
  625. AC->registerAssumption(II);
  626. }
  627. }
  628. {
  629. // Identify what other metadata depends on the cloned version. After
  630. // cloning, replace the metadata with the corrected version for both
  631. // memory instructions and noalias intrinsics.
  632. std::string ext = (Twine("It") + Twine(It)).str();
  633. cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
  634. Header->getContext(), ext);
  635. }
  636. }
  637. // Loop over the PHI nodes in the original block, setting incoming values.
  638. for (PHINode *PN : OrigPHINode) {
  639. if (CompletelyUnroll) {
  640. PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
  641. Header->getInstList().erase(PN);
  642. } else if (ULO.Count > 1) {
  643. Value *InVal = PN->removeIncomingValue(LatchBlock, false);
  644. // If this value was defined in the loop, take the value defined by the
  645. // last iteration of the loop.
  646. if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
  647. if (L->contains(InValI))
  648. InVal = LastValueMap[InVal];
  649. }
  650. assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
  651. PN->addIncoming(InVal, Latches.back());
  652. }
  653. }
  654. auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
  655. bool NeedConditional, Optional<bool> ContinueOnTrue,
  656. bool IsDestLoopExit) {
  657. auto *Term = cast<BranchInst>(Src->getTerminator());
  658. if (NeedConditional) {
  659. // Update the conditional branch's successor for the following
  660. // iteration.
  661. assert(ContinueOnTrue.hasValue() &&
  662. "Expecting valid ContinueOnTrue when NeedConditional is true");
  663. Term->setSuccessor(!(*ContinueOnTrue), Dest);
  664. } else {
  665. // Remove phi operands at this loop exit
  666. if (!IsDestLoopExit) {
  667. BasicBlock *BB = Src;
  668. for (BasicBlock *Succ : successors(BB)) {
  669. // Preserve the incoming value from BB if we are jumping to the block
  670. // in the current loop.
  671. if (Succ == BlockInLoop)
  672. continue;
  673. for (PHINode &Phi : Succ->phis())
  674. Phi.removeIncomingValue(BB, false);
  675. }
  676. }
  677. // Replace the conditional branch with an unconditional one.
  678. BranchInst::Create(Dest, Term);
  679. Term->eraseFromParent();
  680. }
  681. };
  682. // Connect latches of the unrolled iterations to the headers of the next
  683. // iteration. If the latch is also the exiting block, the conditional branch
  684. // may have to be preserved.
  685. for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
  686. // The branch destination.
  687. unsigned j = (i + 1) % e;
  688. BasicBlock *Dest = Headers[j];
  689. bool NeedConditional = LatchIsExiting;
  690. if (LatchIsExiting) {
  691. if (RuntimeTripCount && j != 0)
  692. NeedConditional = false;
  693. // For a complete unroll, make the last iteration end with a branch
  694. // to the exit block.
  695. if (CompletelyUnroll) {
  696. if (j == 0)
  697. Dest = LoopExit;
  698. // If using trip count upper bound to completely unroll, we need to
  699. // keep the conditional branch except the last one because the loop
  700. // may exit after any iteration.
  701. assert(NeedConditional &&
  702. "NeedCondition cannot be modified by both complete "
  703. "unrolling and runtime unrolling");
  704. NeedConditional =
  705. (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
  706. } else if (j != BreakoutTrip &&
  707. (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
  708. // If we know the trip count or a multiple of it, we can safely use an
  709. // unconditional branch for some iterations.
  710. NeedConditional = false;
  711. }
  712. }
  713. setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
  714. Dest == LoopExit);
  715. }
  716. if (!LatchIsExiting) {
  717. // If the latch is not exiting, we may be able to simplify the conditional
  718. // branches in the unrolled exiting blocks.
  719. for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
  720. // The branch destination.
  721. unsigned j = (i + 1) % e;
  722. bool NeedConditional = true;
  723. if (RuntimeTripCount && j != 0)
  724. NeedConditional = false;
  725. if (CompletelyUnroll)
  726. // We cannot drop the conditional branch for the last condition, as we
  727. // may have to execute the loop body depending on the condition.
  728. NeedConditional = j == 0 || ULO.PreserveCondBr;
  729. else if (j != BreakoutTrip &&
  730. (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
  731. // If we know the trip count or a multiple of it, we can safely use an
  732. // unconditional branch for some iterations.
  733. NeedConditional = false;
  734. // Conditional branches from non-latch exiting block have successors
  735. // either in the same loop iteration or outside the loop. The branches are
  736. // already correct.
  737. if (NeedConditional)
  738. continue;
  739. setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
  740. None, false);
  741. }
  742. // When completely unrolling, the last latch becomes unreachable.
  743. if (CompletelyUnroll) {
  744. BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
  745. new UnreachableInst(Term->getContext(), Term);
  746. Term->eraseFromParent();
  747. }
  748. }
  749. // Update dominators of blocks we might reach through exits.
  750. // Immediate dominator of such block might change, because we add more
  751. // routes which can lead to the exit: we can now reach it from the copied
  752. // iterations too.
  753. if (DT && ULO.Count > 1) {
  754. for (auto *BB : OriginalLoopBlocks) {
  755. auto *BBDomNode = DT->getNode(BB);
  756. SmallVector<BasicBlock *, 16> ChildrenToUpdate;
  757. for (auto *ChildDomNode : BBDomNode->children()) {
  758. auto *ChildBB = ChildDomNode->getBlock();
  759. if (!L->contains(ChildBB))
  760. ChildrenToUpdate.push_back(ChildBB);
  761. }
  762. BasicBlock *NewIDom;
  763. if (ExitingBI && BB == ExitingBlocks[0]) {
  764. // The latch is special because we emit unconditional branches in
  765. // some cases where the original loop contained a conditional branch.
  766. // Since the latch is always at the bottom of the loop, if the latch
  767. // dominated an exit before unrolling, the new dominator of that exit
  768. // must also be a latch. Specifically, the dominator is the first
  769. // latch which ends in a conditional branch, or the last latch if
  770. // there is no such latch.
  771. // For loops exiting from non latch exiting block, we limit the
  772. // branch simplification to single exiting block loops.
  773. NewIDom = ExitingBlocks.back();
  774. for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
  775. Instruction *Term = ExitingBlocks[i]->getTerminator();
  776. if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
  777. NewIDom =
  778. DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
  779. break;
  780. }
  781. }
  782. } else {
  783. // The new idom of the block will be the nearest common dominator
  784. // of all copies of the previous idom. This is equivalent to the
  785. // nearest common dominator of the previous idom and the first latch,
  786. // which dominates all copies of the previous idom.
  787. NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
  788. }
  789. for (auto *ChildBB : ChildrenToUpdate)
  790. DT->changeImmediateDominator(ChildBB, NewIDom);
  791. }
  792. }
  793. assert(!DT || !UnrollVerifyDomtree ||
  794. DT->verify(DominatorTree::VerificationLevel::Fast));
  795. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  796. // Merge adjacent basic blocks, if possible.
  797. for (BasicBlock *Latch : Latches) {
  798. BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
  799. assert((Term ||
  800. (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
  801. "Need a branch as terminator, except when fully unrolling with "
  802. "unconditional latch");
  803. if (Term && Term->isUnconditional()) {
  804. BasicBlock *Dest = Term->getSuccessor(0);
  805. BasicBlock *Fold = Dest->getUniquePredecessor();
  806. if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
  807. // Dest has been folded into Fold. Update our worklists accordingly.
  808. std::replace(Latches.begin(), Latches.end(), Dest, Fold);
  809. llvm::erase_value(UnrolledLoopBlocks, Dest);
  810. }
  811. }
  812. }
  813. // Apply updates to the DomTree.
  814. DT = &DTU.getDomTree();
  815. // At this point, the code is well formed. We now simplify the unrolled loop,
  816. // doing constant propagation and dead code elimination as we go.
  817. simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
  818. SE, DT, AC, TTI);
  819. NumCompletelyUnrolled += CompletelyUnroll;
  820. ++NumUnrolled;
  821. Loop *OuterL = L->getParentLoop();
  822. // Update LoopInfo if the loop is completely removed.
  823. if (CompletelyUnroll)
  824. LI->erase(L);
  825. // After complete unrolling most of the blocks should be contained in OuterL.
  826. // However, some of them might happen to be out of OuterL (e.g. if they
  827. // precede a loop exit). In this case we might need to insert PHI nodes in
  828. // order to preserve LCSSA form.
  829. // We don't need to check this if we already know that we need to fix LCSSA
  830. // form.
  831. // TODO: For now we just recompute LCSSA for the outer loop in this case, but
  832. // it should be possible to fix it in-place.
  833. if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
  834. NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
  835. // If we have a pass and a DominatorTree we should re-simplify impacted loops
  836. // to ensure subsequent analyses can rely on this form. We want to simplify
  837. // at least one layer outside of the loop that was unrolled so that any
  838. // changes to the parent loop exposed by the unrolling are considered.
  839. if (DT) {
  840. if (OuterL) {
  841. // OuterL includes all loops for which we can break loop-simplify, so
  842. // it's sufficient to simplify only it (it'll recursively simplify inner
  843. // loops too).
  844. if (NeedToFixLCSSA) {
  845. // LCSSA must be performed on the outermost affected loop. The unrolled
  846. // loop's last loop latch is guaranteed to be in the outermost loop
  847. // after LoopInfo's been updated by LoopInfo::erase.
  848. Loop *LatchLoop = LI->getLoopFor(Latches.back());
  849. Loop *FixLCSSALoop = OuterL;
  850. if (!FixLCSSALoop->contains(LatchLoop))
  851. while (FixLCSSALoop->getParentLoop() != LatchLoop)
  852. FixLCSSALoop = FixLCSSALoop->getParentLoop();
  853. formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
  854. } else if (PreserveLCSSA) {
  855. assert(OuterL->isLCSSAForm(*DT) &&
  856. "Loops should be in LCSSA form after loop-unroll.");
  857. }
  858. // TODO: That potentially might be compile-time expensive. We should try
  859. // to fix the loop-simplified form incrementally.
  860. simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
  861. } else {
  862. // Simplify loops for which we might've broken loop-simplify form.
  863. for (Loop *SubLoop : LoopsToSimplify)
  864. simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
  865. }
  866. }
  867. return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
  868. : LoopUnrollResult::PartiallyUnrolled;
  869. }
  870. /// Given an llvm.loop loop id metadata node, returns the loop hint metadata
  871. /// node with the given name (for example, "llvm.loop.unroll.count"). If no
  872. /// such metadata node exists, then nullptr is returned.
  873. MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
  874. // First operand should refer to the loop id itself.
  875. assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  876. assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
  877. for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
  878. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  879. if (!MD)
  880. continue;
  881. MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  882. if (!S)
  883. continue;
  884. if (Name.equals(S->getString()))
  885. return MD;
  886. }
  887. return nullptr;
  888. }