LoopFuse.cpp 88 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160
  1. //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. ///
  9. /// \file
  10. /// This file implements the loop fusion pass.
  11. /// The implementation is largely based on the following document:
  12. ///
  13. /// Code Transformations to Augment the Scope of Loop Fusion in a
  14. /// Production Compiler
  15. /// Christopher Mark Barton
  16. /// MSc Thesis
  17. /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
  18. ///
  19. /// The general approach taken is to collect sets of control flow equivalent
  20. /// loops and test whether they can be fused. The necessary conditions for
  21. /// fusion are:
  22. /// 1. The loops must be adjacent (there cannot be any statements between
  23. /// the two loops).
  24. /// 2. The loops must be conforming (they must execute the same number of
  25. /// iterations).
  26. /// 3. The loops must be control flow equivalent (if one loop executes, the
  27. /// other is guaranteed to execute).
  28. /// 4. There cannot be any negative distance dependencies between the loops.
  29. /// If all of these conditions are satisfied, it is safe to fuse the loops.
  30. ///
  31. /// This implementation creates FusionCandidates that represent the loop and the
  32. /// necessary information needed by fusion. It then operates on the fusion
  33. /// candidates, first confirming that the candidate is eligible for fusion. The
  34. /// candidates are then collected into control flow equivalent sets, sorted in
  35. /// dominance order. Each set of control flow equivalent candidates is then
  36. /// traversed, attempting to fuse pairs of candidates in the set. If all
  37. /// requirements for fusion are met, the two candidates are fused, creating a
  38. /// new (fused) candidate which is then added back into the set to consider for
  39. /// additional fusion.
  40. ///
  41. /// This implementation currently does not make any modifications to remove
  42. /// conditions for fusion. Code transformations to make loops conform to each of
  43. /// the conditions for fusion are discussed in more detail in the document
  44. /// above. These can be added to the current implementation in the future.
  45. //===----------------------------------------------------------------------===//
  46. #include "llvm/Transforms/Scalar/LoopFuse.h"
  47. #include "llvm/ADT/Statistic.h"
  48. #include "llvm/Analysis/AssumptionCache.h"
  49. #include "llvm/Analysis/DependenceAnalysis.h"
  50. #include "llvm/Analysis/DomTreeUpdater.h"
  51. #include "llvm/Analysis/LoopInfo.h"
  52. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  53. #include "llvm/Analysis/PostDominators.h"
  54. #include "llvm/Analysis/ScalarEvolution.h"
  55. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  56. #include "llvm/Analysis/TargetTransformInfo.h"
  57. #include "llvm/IR/Function.h"
  58. #include "llvm/IR/Verifier.h"
  59. #include "llvm/InitializePasses.h"
  60. #include "llvm/Pass.h"
  61. #include "llvm/Support/CommandLine.h"
  62. #include "llvm/Support/Debug.h"
  63. #include "llvm/Support/raw_ostream.h"
  64. #include "llvm/Transforms/Scalar.h"
  65. #include "llvm/Transforms/Utils.h"
  66. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  67. #include "llvm/Transforms/Utils/CodeMoverUtils.h"
  68. #include "llvm/Transforms/Utils/LoopPeel.h"
  69. #include "llvm/Transforms/Utils/LoopSimplify.h"
  70. using namespace llvm;
  71. #define DEBUG_TYPE "loop-fusion"
  72. STATISTIC(FuseCounter, "Loops fused");
  73. STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
  74. STATISTIC(InvalidPreheader, "Loop has invalid preheader");
  75. STATISTIC(InvalidHeader, "Loop has invalid header");
  76. STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
  77. STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
  78. STATISTIC(InvalidLatch, "Loop has invalid latch");
  79. STATISTIC(InvalidLoop, "Loop is invalid");
  80. STATISTIC(AddressTakenBB, "Basic block has address taken");
  81. STATISTIC(MayThrowException, "Loop may throw an exception");
  82. STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
  83. STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
  84. STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
  85. STATISTIC(UnknownTripCount, "Loop has unknown trip count");
  86. STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
  87. STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
  88. STATISTIC(NonAdjacent, "Loops are not adjacent");
  89. STATISTIC(
  90. NonEmptyPreheader,
  91. "Loop has a non-empty preheader with instructions that cannot be moved");
  92. STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
  93. STATISTIC(NonIdenticalGuards, "Candidates have different guards");
  94. STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
  95. "instructions that cannot be moved");
  96. STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
  97. "instructions that cannot be moved");
  98. STATISTIC(NotRotated, "Candidate is not rotated");
  99. STATISTIC(OnlySecondCandidateIsGuarded,
  100. "The second candidate is guarded while the first one is not");
  101. STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
  102. STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
  103. enum FusionDependenceAnalysisChoice {
  104. FUSION_DEPENDENCE_ANALYSIS_SCEV,
  105. FUSION_DEPENDENCE_ANALYSIS_DA,
  106. FUSION_DEPENDENCE_ANALYSIS_ALL,
  107. };
  108. static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
  109. "loop-fusion-dependence-analysis",
  110. cl::desc("Which dependence analysis should loop fusion use?"),
  111. cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
  112. "Use the scalar evolution interface"),
  113. clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
  114. "Use the dependence analysis interface"),
  115. clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
  116. "Use all available analyses")),
  117. cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL));
  118. static cl::opt<unsigned> FusionPeelMaxCount(
  119. "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
  120. cl::desc("Max number of iterations to be peeled from a loop, such that "
  121. "fusion can take place"));
  122. #ifndef NDEBUG
  123. static cl::opt<bool>
  124. VerboseFusionDebugging("loop-fusion-verbose-debug",
  125. cl::desc("Enable verbose debugging for Loop Fusion"),
  126. cl::Hidden, cl::init(false));
  127. #endif
  128. namespace {
  129. /// This class is used to represent a candidate for loop fusion. When it is
  130. /// constructed, it checks the conditions for loop fusion to ensure that it
  131. /// represents a valid candidate. It caches several parts of a loop that are
  132. /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
  133. /// of continually querying the underlying Loop to retrieve these values. It is
  134. /// assumed these will not change throughout loop fusion.
  135. ///
  136. /// The invalidate method should be used to indicate that the FusionCandidate is
  137. /// no longer a valid candidate for fusion. Similarly, the isValid() method can
  138. /// be used to ensure that the FusionCandidate is still valid for fusion.
  139. struct FusionCandidate {
  140. /// Cache of parts of the loop used throughout loop fusion. These should not
  141. /// need to change throughout the analysis and transformation.
  142. /// These parts are cached to avoid repeatedly looking up in the Loop class.
  143. /// Preheader of the loop this candidate represents
  144. BasicBlock *Preheader;
  145. /// Header of the loop this candidate represents
  146. BasicBlock *Header;
  147. /// Blocks in the loop that exit the loop
  148. BasicBlock *ExitingBlock;
  149. /// The successor block of this loop (where the exiting blocks go to)
  150. BasicBlock *ExitBlock;
  151. /// Latch of the loop
  152. BasicBlock *Latch;
  153. /// The loop that this fusion candidate represents
  154. Loop *L;
  155. /// Vector of instructions in this loop that read from memory
  156. SmallVector<Instruction *, 16> MemReads;
  157. /// Vector of instructions in this loop that write to memory
  158. SmallVector<Instruction *, 16> MemWrites;
  159. /// Are all of the members of this fusion candidate still valid
  160. bool Valid;
  161. /// Guard branch of the loop, if it exists
  162. BranchInst *GuardBranch;
  163. /// Peeling Paramaters of the Loop.
  164. TTI::PeelingPreferences PP;
  165. /// Can you Peel this Loop?
  166. bool AbleToPeel;
  167. /// Has this loop been Peeled
  168. bool Peeled;
  169. /// Dominator and PostDominator trees are needed for the
  170. /// FusionCandidateCompare function, required by FusionCandidateSet to
  171. /// determine where the FusionCandidate should be inserted into the set. These
  172. /// are used to establish ordering of the FusionCandidates based on dominance.
  173. DominatorTree &DT;
  174. const PostDominatorTree *PDT;
  175. OptimizationRemarkEmitter &ORE;
  176. FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT,
  177. OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP)
  178. : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
  179. ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
  180. Latch(L->getLoopLatch()), L(L), Valid(true),
  181. GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
  182. Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
  183. // Walk over all blocks in the loop and check for conditions that may
  184. // prevent fusion. For each block, walk over all instructions and collect
  185. // the memory reads and writes If any instructions that prevent fusion are
  186. // found, invalidate this object and return.
  187. for (BasicBlock *BB : L->blocks()) {
  188. if (BB->hasAddressTaken()) {
  189. invalidate();
  190. reportInvalidCandidate(AddressTakenBB);
  191. return;
  192. }
  193. for (Instruction &I : *BB) {
  194. if (I.mayThrow()) {
  195. invalidate();
  196. reportInvalidCandidate(MayThrowException);
  197. return;
  198. }
  199. if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
  200. if (SI->isVolatile()) {
  201. invalidate();
  202. reportInvalidCandidate(ContainsVolatileAccess);
  203. return;
  204. }
  205. }
  206. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
  207. if (LI->isVolatile()) {
  208. invalidate();
  209. reportInvalidCandidate(ContainsVolatileAccess);
  210. return;
  211. }
  212. }
  213. if (I.mayWriteToMemory())
  214. MemWrites.push_back(&I);
  215. if (I.mayReadFromMemory())
  216. MemReads.push_back(&I);
  217. }
  218. }
  219. }
  220. /// Check if all members of the class are valid.
  221. bool isValid() const {
  222. return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
  223. !L->isInvalid() && Valid;
  224. }
  225. /// Verify that all members are in sync with the Loop object.
  226. void verify() const {
  227. assert(isValid() && "Candidate is not valid!!");
  228. assert(!L->isInvalid() && "Loop is invalid!");
  229. assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
  230. assert(Header == L->getHeader() && "Header is out of sync");
  231. assert(ExitingBlock == L->getExitingBlock() &&
  232. "Exiting Blocks is out of sync");
  233. assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
  234. assert(Latch == L->getLoopLatch() && "Latch is out of sync");
  235. }
  236. /// Get the entry block for this fusion candidate.
  237. ///
  238. /// If this fusion candidate represents a guarded loop, the entry block is the
  239. /// loop guard block. If it represents an unguarded loop, the entry block is
  240. /// the preheader of the loop.
  241. BasicBlock *getEntryBlock() const {
  242. if (GuardBranch)
  243. return GuardBranch->getParent();
  244. else
  245. return Preheader;
  246. }
  247. /// After Peeling the loop is modified quite a bit, hence all of the Blocks
  248. /// need to be updated accordingly.
  249. void updateAfterPeeling() {
  250. Preheader = L->getLoopPreheader();
  251. Header = L->getHeader();
  252. ExitingBlock = L->getExitingBlock();
  253. ExitBlock = L->getExitBlock();
  254. Latch = L->getLoopLatch();
  255. verify();
  256. }
  257. /// Given a guarded loop, get the successor of the guard that is not in the
  258. /// loop.
  259. ///
  260. /// This method returns the successor of the loop guard that is not located
  261. /// within the loop (i.e., the successor of the guard that is not the
  262. /// preheader).
  263. /// This method is only valid for guarded loops.
  264. BasicBlock *getNonLoopBlock() const {
  265. assert(GuardBranch && "Only valid on guarded loops.");
  266. assert(GuardBranch->isConditional() &&
  267. "Expecting guard to be a conditional branch.");
  268. if (Peeled)
  269. return GuardBranch->getSuccessor(1);
  270. return (GuardBranch->getSuccessor(0) == Preheader)
  271. ? GuardBranch->getSuccessor(1)
  272. : GuardBranch->getSuccessor(0);
  273. }
  274. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  275. LLVM_DUMP_METHOD void dump() const {
  276. dbgs() << "\tGuardBranch: ";
  277. if (GuardBranch)
  278. dbgs() << *GuardBranch;
  279. else
  280. dbgs() << "nullptr";
  281. dbgs() << "\n"
  282. << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
  283. << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
  284. << "\n"
  285. << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
  286. << "\tExitingBB: "
  287. << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
  288. << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
  289. << "\n"
  290. << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
  291. << "\tEntryBlock: "
  292. << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
  293. << "\n";
  294. }
  295. #endif
  296. /// Determine if a fusion candidate (representing a loop) is eligible for
  297. /// fusion. Note that this only checks whether a single loop can be fused - it
  298. /// does not check whether it is *legal* to fuse two loops together.
  299. bool isEligibleForFusion(ScalarEvolution &SE) const {
  300. if (!isValid()) {
  301. LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
  302. if (!Preheader)
  303. ++InvalidPreheader;
  304. if (!Header)
  305. ++InvalidHeader;
  306. if (!ExitingBlock)
  307. ++InvalidExitingBlock;
  308. if (!ExitBlock)
  309. ++InvalidExitBlock;
  310. if (!Latch)
  311. ++InvalidLatch;
  312. if (L->isInvalid())
  313. ++InvalidLoop;
  314. return false;
  315. }
  316. // Require ScalarEvolution to be able to determine a trip count.
  317. if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
  318. LLVM_DEBUG(dbgs() << "Loop " << L->getName()
  319. << " trip count not computable!\n");
  320. return reportInvalidCandidate(UnknownTripCount);
  321. }
  322. if (!L->isLoopSimplifyForm()) {
  323. LLVM_DEBUG(dbgs() << "Loop " << L->getName()
  324. << " is not in simplified form!\n");
  325. return reportInvalidCandidate(NotSimplifiedForm);
  326. }
  327. if (!L->isRotatedForm()) {
  328. LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
  329. return reportInvalidCandidate(NotRotated);
  330. }
  331. return true;
  332. }
  333. private:
  334. // This is only used internally for now, to clear the MemWrites and MemReads
  335. // list and setting Valid to false. I can't envision other uses of this right
  336. // now, since once FusionCandidates are put into the FusionCandidateSet they
  337. // are immutable. Thus, any time we need to change/update a FusionCandidate,
  338. // we must create a new one and insert it into the FusionCandidateSet to
  339. // ensure the FusionCandidateSet remains ordered correctly.
  340. void invalidate() {
  341. MemWrites.clear();
  342. MemReads.clear();
  343. Valid = false;
  344. }
  345. bool reportInvalidCandidate(llvm::Statistic &Stat) const {
  346. using namespace ore;
  347. assert(L && Preheader && "Fusion candidate not initialized properly!");
  348. #if LLVM_ENABLE_STATS
  349. ++Stat;
  350. ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
  351. L->getStartLoc(), Preheader)
  352. << "[" << Preheader->getParent()->getName() << "]: "
  353. << "Loop is not a candidate for fusion: " << Stat.getDesc());
  354. #endif
  355. return false;
  356. }
  357. };
  358. struct FusionCandidateCompare {
  359. /// Comparison functor to sort two Control Flow Equivalent fusion candidates
  360. /// into dominance order.
  361. /// If LHS dominates RHS and RHS post-dominates LHS, return true;
  362. /// If RHS dominates LHS and LHS post-dominates RHS, return false;
  363. /// If both LHS and RHS are not dominating each other then, non-strictly
  364. /// post dominate check will decide the order of candidates. If RHS
  365. /// non-strictly post dominates LHS then, return true. If LHS non-strictly
  366. /// post dominates RHS then, return false. If both are non-strictly post
  367. /// dominate each other then, level in the post dominator tree will decide
  368. /// the order of candidates.
  369. bool operator()(const FusionCandidate &LHS,
  370. const FusionCandidate &RHS) const {
  371. const DominatorTree *DT = &(LHS.DT);
  372. BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
  373. BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
  374. // Do not save PDT to local variable as it is only used in asserts and thus
  375. // will trigger an unused variable warning if building without asserts.
  376. assert(DT && LHS.PDT && "Expecting valid dominator tree");
  377. // Do this compare first so if LHS == RHS, function returns false.
  378. if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
  379. // RHS dominates LHS
  380. // Verify LHS post-dominates RHS
  381. assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
  382. return false;
  383. }
  384. if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
  385. // Verify RHS Postdominates LHS
  386. assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
  387. return true;
  388. }
  389. // If two FusionCandidates are in the same level of dominator tree,
  390. // they will not dominate each other, but may still be control flow
  391. // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()
  392. // function is needed.
  393. bool WrongOrder =
  394. nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT);
  395. bool RightOrder =
  396. nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT);
  397. if (WrongOrder && RightOrder) {
  398. // If common predecessor of LHS and RHS post dominates both
  399. // FusionCandidates then, Order of FusionCandidate can be
  400. // identified by its level in post dominator tree.
  401. DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock);
  402. DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock);
  403. return LNode->getLevel() > RNode->getLevel();
  404. } else if (WrongOrder)
  405. return false;
  406. else if (RightOrder)
  407. return true;
  408. // If LHS does not non-strict Postdominate RHS and RHS does not non-strict
  409. // Postdominate LHS then, there is no dominance relationship between the
  410. // two FusionCandidates. Thus, they should not be in the same set together.
  411. llvm_unreachable(
  412. "No dominance relationship between these fusion candidates!");
  413. }
  414. };
  415. using LoopVector = SmallVector<Loop *, 4>;
  416. // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
  417. // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
  418. // dominates FC1 and FC1 post-dominates FC0.
  419. // std::set was chosen because we want a sorted data structure with stable
  420. // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
  421. // loops by moving intervening code around. When this intervening code contains
  422. // loops, those loops will be moved also. The corresponding FusionCandidates
  423. // will also need to be moved accordingly. As this is done, having stable
  424. // iterators will simplify the logic. Similarly, having an efficient insert that
  425. // keeps the FusionCandidateSet sorted will also simplify the implementation.
  426. using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
  427. using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
  428. #if !defined(NDEBUG)
  429. static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
  430. const FusionCandidate &FC) {
  431. if (FC.isValid())
  432. OS << FC.Preheader->getName();
  433. else
  434. OS << "<Invalid>";
  435. return OS;
  436. }
  437. static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
  438. const FusionCandidateSet &CandSet) {
  439. for (const FusionCandidate &FC : CandSet)
  440. OS << FC << '\n';
  441. return OS;
  442. }
  443. static void
  444. printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
  445. dbgs() << "Fusion Candidates: \n";
  446. for (const auto &CandidateSet : FusionCandidates) {
  447. dbgs() << "*** Fusion Candidate Set ***\n";
  448. dbgs() << CandidateSet;
  449. dbgs() << "****************************\n";
  450. }
  451. }
  452. #endif
  453. /// Collect all loops in function at the same nest level, starting at the
  454. /// outermost level.
  455. ///
  456. /// This data structure collects all loops at the same nest level for a
  457. /// given function (specified by the LoopInfo object). It starts at the
  458. /// outermost level.
  459. struct LoopDepthTree {
  460. using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
  461. using iterator = LoopsOnLevelTy::iterator;
  462. using const_iterator = LoopsOnLevelTy::const_iterator;
  463. LoopDepthTree(LoopInfo &LI) : Depth(1) {
  464. if (!LI.empty())
  465. LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
  466. }
  467. /// Test whether a given loop has been removed from the function, and thus is
  468. /// no longer valid.
  469. bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
  470. /// Record that a given loop has been removed from the function and is no
  471. /// longer valid.
  472. void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
  473. /// Descend the tree to the next (inner) nesting level
  474. void descend() {
  475. LoopsOnLevelTy LoopsOnNextLevel;
  476. for (const LoopVector &LV : *this)
  477. for (Loop *L : LV)
  478. if (!isRemovedLoop(L) && L->begin() != L->end())
  479. LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
  480. LoopsOnLevel = LoopsOnNextLevel;
  481. RemovedLoops.clear();
  482. Depth++;
  483. }
  484. bool empty() const { return size() == 0; }
  485. size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
  486. unsigned getDepth() const { return Depth; }
  487. iterator begin() { return LoopsOnLevel.begin(); }
  488. iterator end() { return LoopsOnLevel.end(); }
  489. const_iterator begin() const { return LoopsOnLevel.begin(); }
  490. const_iterator end() const { return LoopsOnLevel.end(); }
  491. private:
  492. /// Set of loops that have been removed from the function and are no longer
  493. /// valid.
  494. SmallPtrSet<const Loop *, 8> RemovedLoops;
  495. /// Depth of the current level, starting at 1 (outermost loops).
  496. unsigned Depth;
  497. /// Vector of loops at the current depth level that have the same parent loop
  498. LoopsOnLevelTy LoopsOnLevel;
  499. };
  500. #ifndef NDEBUG
  501. static void printLoopVector(const LoopVector &LV) {
  502. dbgs() << "****************************\n";
  503. for (auto *L : LV)
  504. printLoop(*L, dbgs());
  505. dbgs() << "****************************\n";
  506. }
  507. #endif
  508. struct LoopFuser {
  509. private:
  510. // Sets of control flow equivalent fusion candidates for a given nest level.
  511. FusionCandidateCollection FusionCandidates;
  512. LoopDepthTree LDT;
  513. DomTreeUpdater DTU;
  514. LoopInfo &LI;
  515. DominatorTree &DT;
  516. DependenceInfo &DI;
  517. ScalarEvolution &SE;
  518. PostDominatorTree &PDT;
  519. OptimizationRemarkEmitter &ORE;
  520. AssumptionCache &AC;
  521. const TargetTransformInfo &TTI;
  522. public:
  523. LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
  524. ScalarEvolution &SE, PostDominatorTree &PDT,
  525. OptimizationRemarkEmitter &ORE, const DataLayout &DL,
  526. AssumptionCache &AC, const TargetTransformInfo &TTI)
  527. : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
  528. DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
  529. /// This is the main entry point for loop fusion. It will traverse the
  530. /// specified function and collect candidate loops to fuse, starting at the
  531. /// outermost nesting level and working inwards.
  532. bool fuseLoops(Function &F) {
  533. #ifndef NDEBUG
  534. if (VerboseFusionDebugging) {
  535. LI.print(dbgs());
  536. }
  537. #endif
  538. LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
  539. << "\n");
  540. bool Changed = false;
  541. while (!LDT.empty()) {
  542. LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
  543. << LDT.getDepth() << "\n";);
  544. for (const LoopVector &LV : LDT) {
  545. assert(LV.size() > 0 && "Empty loop set was build!");
  546. // Skip singleton loop sets as they do not offer fusion opportunities on
  547. // this level.
  548. if (LV.size() == 1)
  549. continue;
  550. #ifndef NDEBUG
  551. if (VerboseFusionDebugging) {
  552. LLVM_DEBUG({
  553. dbgs() << " Visit loop set (#" << LV.size() << "):\n";
  554. printLoopVector(LV);
  555. });
  556. }
  557. #endif
  558. collectFusionCandidates(LV);
  559. Changed |= fuseCandidates();
  560. }
  561. // Finished analyzing candidates at this level.
  562. // Descend to the next level and clear all of the candidates currently
  563. // collected. Note that it will not be possible to fuse any of the
  564. // existing candidates with new candidates because the new candidates will
  565. // be at a different nest level and thus not be control flow equivalent
  566. // with all of the candidates collected so far.
  567. LLVM_DEBUG(dbgs() << "Descend one level!\n");
  568. LDT.descend();
  569. FusionCandidates.clear();
  570. }
  571. if (Changed)
  572. LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
  573. #ifndef NDEBUG
  574. assert(DT.verify());
  575. assert(PDT.verify());
  576. LI.verify(DT);
  577. SE.verify();
  578. #endif
  579. LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
  580. return Changed;
  581. }
  582. private:
  583. /// Determine if two fusion candidates are control flow equivalent.
  584. ///
  585. /// Two fusion candidates are control flow equivalent if when one executes,
  586. /// the other is guaranteed to execute. This is determined using dominators
  587. /// and post-dominators: if A dominates B and B post-dominates A then A and B
  588. /// are control-flow equivalent.
  589. bool isControlFlowEquivalent(const FusionCandidate &FC0,
  590. const FusionCandidate &FC1) const {
  591. assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
  592. return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
  593. DT, PDT);
  594. }
  595. /// Iterate over all loops in the given loop set and identify the loops that
  596. /// are eligible for fusion. Place all eligible fusion candidates into Control
  597. /// Flow Equivalent sets, sorted by dominance.
  598. void collectFusionCandidates(const LoopVector &LV) {
  599. for (Loop *L : LV) {
  600. TTI::PeelingPreferences PP =
  601. gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt);
  602. FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
  603. if (!CurrCand.isEligibleForFusion(SE))
  604. continue;
  605. // Go through each list in FusionCandidates and determine if L is control
  606. // flow equivalent with the first loop in that list. If it is, append LV.
  607. // If not, go to the next list.
  608. // If no suitable list is found, start another list and add it to
  609. // FusionCandidates.
  610. bool FoundSet = false;
  611. for (auto &CurrCandSet : FusionCandidates) {
  612. if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
  613. CurrCandSet.insert(CurrCand);
  614. FoundSet = true;
  615. #ifndef NDEBUG
  616. if (VerboseFusionDebugging)
  617. LLVM_DEBUG(dbgs() << "Adding " << CurrCand
  618. << " to existing candidate set\n");
  619. #endif
  620. break;
  621. }
  622. }
  623. if (!FoundSet) {
  624. // No set was found. Create a new set and add to FusionCandidates
  625. #ifndef NDEBUG
  626. if (VerboseFusionDebugging)
  627. LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
  628. #endif
  629. FusionCandidateSet NewCandSet;
  630. NewCandSet.insert(CurrCand);
  631. FusionCandidates.push_back(NewCandSet);
  632. }
  633. NumFusionCandidates++;
  634. }
  635. }
  636. /// Determine if it is beneficial to fuse two loops.
  637. ///
  638. /// For now, this method simply returns true because we want to fuse as much
  639. /// as possible (primarily to test the pass). This method will evolve, over
  640. /// time, to add heuristics for profitability of fusion.
  641. bool isBeneficialFusion(const FusionCandidate &FC0,
  642. const FusionCandidate &FC1) {
  643. return true;
  644. }
  645. /// Determine if two fusion candidates have the same trip count (i.e., they
  646. /// execute the same number of iterations).
  647. ///
  648. /// This function will return a pair of values. The first is a boolean,
  649. /// stating whether or not the two candidates are known at compile time to
  650. /// have the same TripCount. The second is the difference in the two
  651. /// TripCounts. This information can be used later to determine whether or not
  652. /// peeling can be performed on either one of the candidates.
  653. std::pair<bool, std::optional<unsigned>>
  654. haveIdenticalTripCounts(const FusionCandidate &FC0,
  655. const FusionCandidate &FC1) const {
  656. const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
  657. if (isa<SCEVCouldNotCompute>(TripCount0)) {
  658. UncomputableTripCount++;
  659. LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
  660. return {false, std::nullopt};
  661. }
  662. const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
  663. if (isa<SCEVCouldNotCompute>(TripCount1)) {
  664. UncomputableTripCount++;
  665. LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
  666. return {false, std::nullopt};
  667. }
  668. LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
  669. << *TripCount1 << " are "
  670. << (TripCount0 == TripCount1 ? "identical" : "different")
  671. << "\n");
  672. if (TripCount0 == TripCount1)
  673. return {true, 0};
  674. LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
  675. "determining the difference between trip counts\n");
  676. // Currently only considering loops with a single exit point
  677. // and a non-constant trip count.
  678. const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
  679. const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
  680. // If any of the tripcounts are zero that means that loop(s) do not have
  681. // a single exit or a constant tripcount.
  682. if (TC0 == 0 || TC1 == 0) {
  683. LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
  684. "have a constant number of iterations. Peeling "
  685. "is not benefical\n");
  686. return {false, std::nullopt};
  687. }
  688. std::optional<unsigned> Difference;
  689. int Diff = TC0 - TC1;
  690. if (Diff > 0)
  691. Difference = Diff;
  692. else {
  693. LLVM_DEBUG(
  694. dbgs() << "Difference is less than 0. FC1 (second loop) has more "
  695. "iterations than the first one. Currently not supported\n");
  696. }
  697. LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
  698. << "\n");
  699. return {false, Difference};
  700. }
  701. void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
  702. unsigned PeelCount) {
  703. assert(FC0.AbleToPeel && "Should be able to peel loop");
  704. LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
  705. << " iterations of the first loop. \n");
  706. ValueToValueMapTy VMap;
  707. FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap);
  708. if (FC0.Peeled) {
  709. LLVM_DEBUG(dbgs() << "Done Peeling\n");
  710. #ifndef NDEBUG
  711. auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
  712. assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
  713. "Loops should have identical trip counts after peeling");
  714. #endif
  715. FC0.PP.PeelCount += PeelCount;
  716. // Peeling does not update the PDT
  717. PDT.recalculate(*FC0.Preheader->getParent());
  718. FC0.updateAfterPeeling();
  719. // In this case the iterations of the loop are constant, so the first
  720. // loop will execute completely (will not jump from one of
  721. // the peeled blocks to the second loop). Here we are updating the
  722. // branch conditions of each of the peeled blocks, such that it will
  723. // branch to its successor which is not the preheader of the second loop
  724. // in the case of unguarded loops, or the succesors of the exit block of
  725. // the first loop otherwise. Doing this update will ensure that the entry
  726. // block of the first loop dominates the entry block of the second loop.
  727. BasicBlock *BB =
  728. FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
  729. if (BB) {
  730. SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
  731. SmallVector<Instruction *, 8> WorkList;
  732. for (BasicBlock *Pred : predecessors(BB)) {
  733. if (Pred != FC0.ExitBlock) {
  734. WorkList.emplace_back(Pred->getTerminator());
  735. TreeUpdates.emplace_back(
  736. DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
  737. }
  738. }
  739. // Cannot modify the predecessors inside the above loop as it will cause
  740. // the iterators to be nullptrs, causing memory errors.
  741. for (Instruction *CurrentBranch : WorkList) {
  742. BasicBlock *Succ = CurrentBranch->getSuccessor(0);
  743. if (Succ == BB)
  744. Succ = CurrentBranch->getSuccessor(1);
  745. ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
  746. }
  747. DTU.applyUpdates(TreeUpdates);
  748. DTU.flush();
  749. }
  750. LLVM_DEBUG(
  751. dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
  752. << " iterations from the first loop.\n"
  753. "Both Loops have the same number of iterations now.\n");
  754. }
  755. }
  756. /// Walk each set of control flow equivalent fusion candidates and attempt to
  757. /// fuse them. This does a single linear traversal of all candidates in the
  758. /// set. The conditions for legal fusion are checked at this point. If a pair
  759. /// of fusion candidates passes all legality checks, they are fused together
  760. /// and a new fusion candidate is created and added to the FusionCandidateSet.
  761. /// The original fusion candidates are then removed, as they are no longer
  762. /// valid.
  763. bool fuseCandidates() {
  764. bool Fused = false;
  765. LLVM_DEBUG(printFusionCandidates(FusionCandidates));
  766. for (auto &CandidateSet : FusionCandidates) {
  767. if (CandidateSet.size() < 2)
  768. continue;
  769. LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
  770. << CandidateSet << "\n");
  771. for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
  772. assert(!LDT.isRemovedLoop(FC0->L) &&
  773. "Should not have removed loops in CandidateSet!");
  774. auto FC1 = FC0;
  775. for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
  776. assert(!LDT.isRemovedLoop(FC1->L) &&
  777. "Should not have removed loops in CandidateSet!");
  778. LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
  779. dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
  780. FC0->verify();
  781. FC1->verify();
  782. // Check if the candidates have identical tripcounts (first value of
  783. // pair), and if not check the difference in the tripcounts between
  784. // the loops (second value of pair). The difference is not equal to
  785. // std::nullopt iff the loops iterate a constant number of times, and
  786. // have a single exit.
  787. std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes =
  788. haveIdenticalTripCounts(*FC0, *FC1);
  789. bool SameTripCount = IdenticalTripCountRes.first;
  790. std::optional<unsigned> TCDifference = IdenticalTripCountRes.second;
  791. // Here we are checking that FC0 (the first loop) can be peeled, and
  792. // both loops have different tripcounts.
  793. if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
  794. if (*TCDifference > FusionPeelMaxCount) {
  795. LLVM_DEBUG(dbgs()
  796. << "Difference in loop trip counts: " << *TCDifference
  797. << " is greater than maximum peel count specificed: "
  798. << FusionPeelMaxCount << "\n");
  799. } else {
  800. // Dependent on peeling being performed on the first loop, and
  801. // assuming all other conditions for fusion return true.
  802. SameTripCount = true;
  803. }
  804. }
  805. if (!SameTripCount) {
  806. LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
  807. "counts. Not fusing.\n");
  808. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  809. NonEqualTripCount);
  810. continue;
  811. }
  812. if (!isAdjacent(*FC0, *FC1)) {
  813. LLVM_DEBUG(dbgs()
  814. << "Fusion candidates are not adjacent. Not fusing.\n");
  815. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
  816. continue;
  817. }
  818. if ((!FC0->GuardBranch && FC1->GuardBranch) ||
  819. (FC0->GuardBranch && !FC1->GuardBranch)) {
  820. LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "
  821. "another one is not. Not fusing.\n");
  822. reportLoopFusion<OptimizationRemarkMissed>(
  823. *FC0, *FC1, OnlySecondCandidateIsGuarded);
  824. continue;
  825. }
  826. // Ensure that FC0 and FC1 have identical guards.
  827. // If one (or both) are not guarded, this check is not necessary.
  828. if (FC0->GuardBranch && FC1->GuardBranch &&
  829. !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
  830. LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
  831. "guards. Not Fusing.\n");
  832. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  833. NonIdenticalGuards);
  834. continue;
  835. }
  836. if (FC0->GuardBranch) {
  837. assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
  838. if (!isSafeToMoveBefore(*FC0->ExitBlock,
  839. *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
  840. &PDT, &DI)) {
  841. LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
  842. "instructions in exit block. Not fusing.\n");
  843. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  844. NonEmptyExitBlock);
  845. continue;
  846. }
  847. if (!isSafeToMoveBefore(
  848. *FC1->GuardBranch->getParent(),
  849. *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
  850. &DI)) {
  851. LLVM_DEBUG(dbgs()
  852. << "Fusion candidate contains unsafe "
  853. "instructions in guard block. Not fusing.\n");
  854. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  855. NonEmptyGuardBlock);
  856. continue;
  857. }
  858. }
  859. // Check the dependencies across the loops and do not fuse if it would
  860. // violate them.
  861. if (!dependencesAllowFusion(*FC0, *FC1)) {
  862. LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
  863. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  864. InvalidDependencies);
  865. continue;
  866. }
  867. // If the second loop has instructions in the pre-header, attempt to
  868. // hoist them up to the first loop's pre-header or sink them into the
  869. // body of the second loop.
  870. SmallVector<Instruction *, 4> SafeToHoist;
  871. SmallVector<Instruction *, 4> SafeToSink;
  872. // At this point, this is the last remaining legality check.
  873. // Which means if we can make this pre-header empty, we can fuse
  874. // these loops
  875. if (!isEmptyPreheader(*FC1)) {
  876. LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
  877. "preheader.\n");
  878. // If it is not safe to hoist/sink all instructions in the
  879. // pre-header, we cannot fuse these loops.
  880. if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
  881. SafeToSink)) {
  882. LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
  883. "Fusion Candidate Pre-header.\n"
  884. << "Not Fusing.\n");
  885. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  886. NonEmptyPreheader);
  887. continue;
  888. }
  889. }
  890. bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
  891. LLVM_DEBUG(dbgs()
  892. << "\tFusion appears to be "
  893. << (BeneficialToFuse ? "" : "un") << "profitable!\n");
  894. if (!BeneficialToFuse) {
  895. reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
  896. FusionNotBeneficial);
  897. continue;
  898. }
  899. // All analysis has completed and has determined that fusion is legal
  900. // and profitable. At this point, start transforming the code and
  901. // perform fusion.
  902. // Execute the hoist/sink operations on preheader instructions
  903. movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink);
  904. LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
  905. << *FC1 << "\n");
  906. FusionCandidate FC0Copy = *FC0;
  907. // Peel the loop after determining that fusion is legal. The Loops
  908. // will still be safe to fuse after the peeling is performed.
  909. bool Peel = TCDifference && *TCDifference > 0;
  910. if (Peel)
  911. peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
  912. // Report fusion to the Optimization Remarks.
  913. // Note this needs to be done *before* performFusion because
  914. // performFusion will change the original loops, making it not
  915. // possible to identify them after fusion is complete.
  916. reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
  917. FuseCounter);
  918. FusionCandidate FusedCand(
  919. performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
  920. FC0Copy.PP);
  921. FusedCand.verify();
  922. assert(FusedCand.isEligibleForFusion(SE) &&
  923. "Fused candidate should be eligible for fusion!");
  924. // Notify the loop-depth-tree that these loops are not valid objects
  925. LDT.removeLoop(FC1->L);
  926. CandidateSet.erase(FC0);
  927. CandidateSet.erase(FC1);
  928. auto InsertPos = CandidateSet.insert(FusedCand);
  929. assert(InsertPos.second &&
  930. "Unable to insert TargetCandidate in CandidateSet!");
  931. // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
  932. // of the FC1 loop will attempt to fuse the new (fused) loop with the
  933. // remaining candidates in the current candidate set.
  934. FC0 = FC1 = InsertPos.first;
  935. LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
  936. << "\n");
  937. Fused = true;
  938. }
  939. }
  940. }
  941. return Fused;
  942. }
  943. // Returns true if the instruction \p I can be hoisted to the end of the
  944. // preheader of \p FC0. \p SafeToHoist contains the instructions that are
  945. // known to be safe to hoist. The instructions encountered that cannot be
  946. // hoisted are in \p NotHoisting.
  947. // TODO: Move functionality into CodeMoverUtils
  948. bool canHoistInst(Instruction &I,
  949. const SmallVector<Instruction *, 4> &SafeToHoist,
  950. const SmallVector<Instruction *, 4> &NotHoisting,
  951. const FusionCandidate &FC0) const {
  952. const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();
  953. assert(FC0PreheaderTarget &&
  954. "Expected single successor for loop preheader.");
  955. for (Use &Op : I.operands()) {
  956. if (auto *OpInst = dyn_cast<Instruction>(Op)) {
  957. bool OpHoisted = is_contained(SafeToHoist, OpInst);
  958. // Check if we have already decided to hoist this operand. In this
  959. // case, it does not dominate FC0 *yet*, but will after we hoist it.
  960. if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) {
  961. return false;
  962. }
  963. }
  964. }
  965. // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs
  966. // cannot be hoisted and should be sunk to the exit of the fused loop.
  967. if (isa<PHINode>(I))
  968. return false;
  969. // If this isn't a memory inst, hoisting is safe
  970. if (!I.mayReadOrWriteMemory())
  971. return true;
  972. LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");
  973. for (Instruction *NotHoistedInst : NotHoisting) {
  974. if (auto D = DI.depends(&I, NotHoistedInst, true)) {
  975. // Dependency is not read-before-write, write-before-read or
  976. // write-before-write
  977. if (D->isFlow() || D->isAnti() || D->isOutput()) {
  978. LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "
  979. "preheader that is not being hoisted.\n");
  980. return false;
  981. }
  982. }
  983. }
  984. for (Instruction *ReadInst : FC0.MemReads) {
  985. if (auto D = DI.depends(ReadInst, &I, true)) {
  986. // Dependency is not read-before-write
  987. if (D->isAnti()) {
  988. LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");
  989. return false;
  990. }
  991. }
  992. }
  993. for (Instruction *WriteInst : FC0.MemWrites) {
  994. if (auto D = DI.depends(WriteInst, &I, true)) {
  995. // Dependency is not write-before-read or write-before-write
  996. if (D->isFlow() || D->isOutput()) {
  997. LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");
  998. return false;
  999. }
  1000. }
  1001. }
  1002. return true;
  1003. }
  1004. // Returns true if the instruction \p I can be sunk to the top of the exit
  1005. // block of \p FC1.
  1006. // TODO: Move functionality into CodeMoverUtils
  1007. bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const {
  1008. for (User *U : I.users()) {
  1009. if (auto *UI{dyn_cast<Instruction>(U)}) {
  1010. // Cannot sink if user in loop
  1011. // If FC1 has phi users of this value, we cannot sink it into FC1.
  1012. if (FC1.L->contains(UI)) {
  1013. // Cannot hoist or sink this instruction. No hoisting/sinking
  1014. // should take place, loops should not fuse
  1015. return false;
  1016. }
  1017. }
  1018. }
  1019. // If this isn't a memory inst, sinking is safe
  1020. if (!I.mayReadOrWriteMemory())
  1021. return true;
  1022. for (Instruction *ReadInst : FC1.MemReads) {
  1023. if (auto D = DI.depends(&I, ReadInst, true)) {
  1024. // Dependency is not write-before-read
  1025. if (D->isFlow()) {
  1026. LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");
  1027. return false;
  1028. }
  1029. }
  1030. }
  1031. for (Instruction *WriteInst : FC1.MemWrites) {
  1032. if (auto D = DI.depends(&I, WriteInst, true)) {
  1033. // Dependency is not write-before-write or read-before-write
  1034. if (D->isOutput() || D->isAnti()) {
  1035. LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");
  1036. return false;
  1037. }
  1038. }
  1039. }
  1040. return true;
  1041. }
  1042. /// Collect instructions in the \p FC1 Preheader that can be hoisted
  1043. /// to the \p FC0 Preheader or sunk into the \p FC1 Body
  1044. bool collectMovablePreheaderInsts(
  1045. const FusionCandidate &FC0, const FusionCandidate &FC1,
  1046. SmallVector<Instruction *, 4> &SafeToHoist,
  1047. SmallVector<Instruction *, 4> &SafeToSink) const {
  1048. BasicBlock *FC1Preheader = FC1.Preheader;
  1049. // Save the instructions that are not being hoisted, so we know not to hoist
  1050. // mem insts that they dominate.
  1051. SmallVector<Instruction *, 4> NotHoisting;
  1052. for (Instruction &I : *FC1Preheader) {
  1053. // Can't move a branch
  1054. if (&I == FC1Preheader->getTerminator())
  1055. continue;
  1056. // If the instruction has side-effects, give up.
  1057. // TODO: The case of mayReadFromMemory we can handle but requires
  1058. // additional work with a dependence analysis so for now we give
  1059. // up on memory reads.
  1060. if (I.mayThrow() || !I.willReturn()) {
  1061. LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n");
  1062. return false;
  1063. }
  1064. LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n");
  1065. if (I.isAtomic() || I.isVolatile()) {
  1066. LLVM_DEBUG(
  1067. dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");
  1068. return false;
  1069. }
  1070. if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) {
  1071. SafeToHoist.push_back(&I);
  1072. LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");
  1073. } else {
  1074. LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");
  1075. NotHoisting.push_back(&I);
  1076. if (canSinkInst(I, FC1)) {
  1077. SafeToSink.push_back(&I);
  1078. LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");
  1079. } else {
  1080. LLVM_DEBUG(dbgs() << "\tCould not sink.\n");
  1081. return false;
  1082. }
  1083. }
  1084. }
  1085. LLVM_DEBUG(
  1086. dbgs() << "All preheader instructions could be sunk or hoisted!\n");
  1087. return true;
  1088. }
  1089. /// Rewrite all additive recurrences in a SCEV to use a new loop.
  1090. class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
  1091. public:
  1092. AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
  1093. bool UseMax = true)
  1094. : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
  1095. NewL(NewL) {}
  1096. const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
  1097. const Loop *ExprL = Expr->getLoop();
  1098. SmallVector<const SCEV *, 2> Operands;
  1099. if (ExprL == &OldL) {
  1100. append_range(Operands, Expr->operands());
  1101. return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
  1102. }
  1103. if (OldL.contains(ExprL)) {
  1104. bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
  1105. if (!UseMax || !Pos || !Expr->isAffine()) {
  1106. Valid = false;
  1107. return Expr;
  1108. }
  1109. return visit(Expr->getStart());
  1110. }
  1111. for (const SCEV *Op : Expr->operands())
  1112. Operands.push_back(visit(Op));
  1113. return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
  1114. }
  1115. bool wasValidSCEV() const { return Valid; }
  1116. private:
  1117. bool Valid, UseMax;
  1118. const Loop &OldL, &NewL;
  1119. };
  1120. /// Return false if the access functions of \p I0 and \p I1 could cause
  1121. /// a negative dependence.
  1122. bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
  1123. Instruction &I1, bool EqualIsInvalid) {
  1124. Value *Ptr0 = getLoadStorePointerOperand(&I0);
  1125. Value *Ptr1 = getLoadStorePointerOperand(&I1);
  1126. if (!Ptr0 || !Ptr1)
  1127. return false;
  1128. const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
  1129. const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
  1130. #ifndef NDEBUG
  1131. if (VerboseFusionDebugging)
  1132. LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
  1133. << *SCEVPtr1 << "\n");
  1134. #endif
  1135. AddRecLoopReplacer Rewriter(SE, L0, L1);
  1136. SCEVPtr0 = Rewriter.visit(SCEVPtr0);
  1137. #ifndef NDEBUG
  1138. if (VerboseFusionDebugging)
  1139. LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
  1140. << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
  1141. #endif
  1142. if (!Rewriter.wasValidSCEV())
  1143. return false;
  1144. // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
  1145. // L0) and the other is not. We could check if it is monotone and test
  1146. // the beginning and end value instead.
  1147. BasicBlock *L0Header = L0.getHeader();
  1148. auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
  1149. const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
  1150. if (!AddRec)
  1151. return false;
  1152. return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
  1153. !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
  1154. };
  1155. if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
  1156. return false;
  1157. ICmpInst::Predicate Pred =
  1158. EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
  1159. bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
  1160. #ifndef NDEBUG
  1161. if (VerboseFusionDebugging)
  1162. LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
  1163. << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
  1164. << "\n");
  1165. #endif
  1166. return IsAlwaysGE;
  1167. }
  1168. /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
  1169. /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
  1170. /// specified by @p DepChoice are used to determine this.
  1171. bool dependencesAllowFusion(const FusionCandidate &FC0,
  1172. const FusionCandidate &FC1, Instruction &I0,
  1173. Instruction &I1, bool AnyDep,
  1174. FusionDependenceAnalysisChoice DepChoice) {
  1175. #ifndef NDEBUG
  1176. if (VerboseFusionDebugging) {
  1177. LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
  1178. << DepChoice << "\n");
  1179. }
  1180. #endif
  1181. switch (DepChoice) {
  1182. case FUSION_DEPENDENCE_ANALYSIS_SCEV:
  1183. return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
  1184. case FUSION_DEPENDENCE_ANALYSIS_DA: {
  1185. auto DepResult = DI.depends(&I0, &I1, true);
  1186. if (!DepResult)
  1187. return true;
  1188. #ifndef NDEBUG
  1189. if (VerboseFusionDebugging) {
  1190. LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
  1191. dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
  1192. << (DepResult->isOrdered() ? "true" : "false")
  1193. << "]\n");
  1194. LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
  1195. << "\n");
  1196. }
  1197. #endif
  1198. if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
  1199. LLVM_DEBUG(
  1200. dbgs() << "TODO: Implement pred/succ dependence handling!\n");
  1201. // TODO: Can we actually use the dependence info analysis here?
  1202. return false;
  1203. }
  1204. case FUSION_DEPENDENCE_ANALYSIS_ALL:
  1205. return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
  1206. FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
  1207. dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
  1208. FUSION_DEPENDENCE_ANALYSIS_DA);
  1209. }
  1210. llvm_unreachable("Unknown fusion dependence analysis choice!");
  1211. }
  1212. /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
  1213. bool dependencesAllowFusion(const FusionCandidate &FC0,
  1214. const FusionCandidate &FC1) {
  1215. LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
  1216. << "\n");
  1217. assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
  1218. assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
  1219. for (Instruction *WriteL0 : FC0.MemWrites) {
  1220. for (Instruction *WriteL1 : FC1.MemWrites)
  1221. if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
  1222. /* AnyDep */ false,
  1223. FusionDependenceAnalysis)) {
  1224. InvalidDependencies++;
  1225. return false;
  1226. }
  1227. for (Instruction *ReadL1 : FC1.MemReads)
  1228. if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
  1229. /* AnyDep */ false,
  1230. FusionDependenceAnalysis)) {
  1231. InvalidDependencies++;
  1232. return false;
  1233. }
  1234. }
  1235. for (Instruction *WriteL1 : FC1.MemWrites) {
  1236. for (Instruction *WriteL0 : FC0.MemWrites)
  1237. if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
  1238. /* AnyDep */ false,
  1239. FusionDependenceAnalysis)) {
  1240. InvalidDependencies++;
  1241. return false;
  1242. }
  1243. for (Instruction *ReadL0 : FC0.MemReads)
  1244. if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
  1245. /* AnyDep */ false,
  1246. FusionDependenceAnalysis)) {
  1247. InvalidDependencies++;
  1248. return false;
  1249. }
  1250. }
  1251. // Walk through all uses in FC1. For each use, find the reaching def. If the
  1252. // def is located in FC0 then it is is not safe to fuse.
  1253. for (BasicBlock *BB : FC1.L->blocks())
  1254. for (Instruction &I : *BB)
  1255. for (auto &Op : I.operands())
  1256. if (Instruction *Def = dyn_cast<Instruction>(Op))
  1257. if (FC0.L->contains(Def->getParent())) {
  1258. InvalidDependencies++;
  1259. return false;
  1260. }
  1261. return true;
  1262. }
  1263. /// Determine if two fusion candidates are adjacent in the CFG.
  1264. ///
  1265. /// This method will determine if there are additional basic blocks in the CFG
  1266. /// between the exit of \p FC0 and the entry of \p FC1.
  1267. /// If the two candidates are guarded loops, then it checks whether the
  1268. /// non-loop successor of the \p FC0 guard branch is the entry block of \p
  1269. /// FC1. If not, then the loops are not adjacent. If the two candidates are
  1270. /// not guarded loops, then it checks whether the exit block of \p FC0 is the
  1271. /// preheader of \p FC1.
  1272. bool isAdjacent(const FusionCandidate &FC0,
  1273. const FusionCandidate &FC1) const {
  1274. // If the successor of the guard branch is FC1, then the loops are adjacent
  1275. if (FC0.GuardBranch)
  1276. return FC0.getNonLoopBlock() == FC1.getEntryBlock();
  1277. else
  1278. return FC0.ExitBlock == FC1.getEntryBlock();
  1279. }
  1280. bool isEmptyPreheader(const FusionCandidate &FC) const {
  1281. return FC.Preheader->size() == 1;
  1282. }
  1283. /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
  1284. /// and sink others into the body of \p FC1.
  1285. void movePreheaderInsts(const FusionCandidate &FC0,
  1286. const FusionCandidate &FC1,
  1287. SmallVector<Instruction *, 4> &HoistInsts,
  1288. SmallVector<Instruction *, 4> &SinkInsts) const {
  1289. // All preheader instructions except the branch must be hoisted or sunk
  1290. assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 &&
  1291. "Attempting to sink and hoist preheader instructions, but not all "
  1292. "the preheader instructions are accounted for.");
  1293. NumHoistedInsts += HoistInsts.size();
  1294. NumSunkInsts += SinkInsts.size();
  1295. LLVM_DEBUG(if (VerboseFusionDebugging) {
  1296. if (!HoistInsts.empty())
  1297. dbgs() << "Hoisting: \n";
  1298. for (Instruction *I : HoistInsts)
  1299. dbgs() << *I << "\n";
  1300. if (!SinkInsts.empty())
  1301. dbgs() << "Sinking: \n";
  1302. for (Instruction *I : SinkInsts)
  1303. dbgs() << *I << "\n";
  1304. });
  1305. for (Instruction *I : HoistInsts) {
  1306. assert(I->getParent() == FC1.Preheader);
  1307. I->moveBefore(FC0.Preheader->getTerminator());
  1308. }
  1309. // insert instructions in reverse order to maintain dominance relationship
  1310. for (Instruction *I : reverse(SinkInsts)) {
  1311. assert(I->getParent() == FC1.Preheader);
  1312. I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt());
  1313. }
  1314. }
  1315. /// Determine if two fusion candidates have identical guards
  1316. ///
  1317. /// This method will determine if two fusion candidates have the same guards.
  1318. /// The guards are considered the same if:
  1319. /// 1. The instructions to compute the condition used in the compare are
  1320. /// identical.
  1321. /// 2. The successors of the guard have the same flow into/around the loop.
  1322. /// If the compare instructions are identical, then the first successor of the
  1323. /// guard must go to the same place (either the preheader of the loop or the
  1324. /// NonLoopBlock). In other words, the the first successor of both loops must
  1325. /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
  1326. /// the NonLoopBlock). The same must be true for the second successor.
  1327. bool haveIdenticalGuards(const FusionCandidate &FC0,
  1328. const FusionCandidate &FC1) const {
  1329. assert(FC0.GuardBranch && FC1.GuardBranch &&
  1330. "Expecting FC0 and FC1 to be guarded loops.");
  1331. if (auto FC0CmpInst =
  1332. dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
  1333. if (auto FC1CmpInst =
  1334. dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
  1335. if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
  1336. return false;
  1337. // The compare instructions are identical.
  1338. // Now make sure the successor of the guards have the same flow into/around
  1339. // the loop
  1340. if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
  1341. return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
  1342. else
  1343. return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
  1344. }
  1345. /// Modify the latch branch of FC to be unconditional since successors of the
  1346. /// branch are the same.
  1347. void simplifyLatchBranch(const FusionCandidate &FC) const {
  1348. BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
  1349. if (FCLatchBranch) {
  1350. assert(FCLatchBranch->isConditional() &&
  1351. FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
  1352. "Expecting the two successors of FCLatchBranch to be the same");
  1353. BranchInst *NewBranch =
  1354. BranchInst::Create(FCLatchBranch->getSuccessor(0));
  1355. ReplaceInstWithInst(FCLatchBranch, NewBranch);
  1356. }
  1357. }
  1358. /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
  1359. /// successor, then merge FC0.Latch with its unique successor.
  1360. void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
  1361. moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
  1362. if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
  1363. MergeBlockIntoPredecessor(Succ, &DTU, &LI);
  1364. DTU.flush();
  1365. }
  1366. }
  1367. /// Fuse two fusion candidates, creating a new fused loop.
  1368. ///
  1369. /// This method contains the mechanics of fusing two loops, represented by \p
  1370. /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
  1371. /// postdominates \p FC0 (making them control flow equivalent). It also
  1372. /// assumes that the other conditions for fusion have been met: adjacent,
  1373. /// identical trip counts, and no negative distance dependencies exist that
  1374. /// would prevent fusion. Thus, there is no checking for these conditions in
  1375. /// this method.
  1376. ///
  1377. /// Fusion is performed by rewiring the CFG to update successor blocks of the
  1378. /// components of tho loop. Specifically, the following changes are done:
  1379. ///
  1380. /// 1. The preheader of \p FC1 is removed as it is no longer necessary
  1381. /// (because it is currently only a single statement block).
  1382. /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
  1383. /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
  1384. /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
  1385. ///
  1386. /// All of these modifications are done with dominator tree updates, thus
  1387. /// keeping the dominator (and post dominator) information up-to-date.
  1388. ///
  1389. /// This can be improved in the future by actually merging blocks during
  1390. /// fusion. For example, the preheader of \p FC1 can be merged with the
  1391. /// preheader of \p FC0. This would allow loops with more than a single
  1392. /// statement in the preheader to be fused. Similarly, the latch blocks of the
  1393. /// two loops could also be fused into a single block. This will require
  1394. /// analysis to prove it is safe to move the contents of the block past
  1395. /// existing code, which currently has not been implemented.
  1396. Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
  1397. assert(FC0.isValid() && FC1.isValid() &&
  1398. "Expecting valid fusion candidates");
  1399. LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
  1400. dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
  1401. // Move instructions from the preheader of FC1 to the end of the preheader
  1402. // of FC0.
  1403. moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
  1404. // Fusing guarded loops is handled slightly differently than non-guarded
  1405. // loops and has been broken out into a separate method instead of trying to
  1406. // intersperse the logic within a single method.
  1407. if (FC0.GuardBranch)
  1408. return fuseGuardedLoops(FC0, FC1);
  1409. assert(FC1.Preheader ==
  1410. (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
  1411. assert(FC1.Preheader->size() == 1 &&
  1412. FC1.Preheader->getSingleSuccessor() == FC1.Header);
  1413. // Remember the phi nodes originally in the header of FC0 in order to rewire
  1414. // them later. However, this is only necessary if the new loop carried
  1415. // values might not dominate the exiting branch. While we do not generally
  1416. // test if this is the case but simply insert intermediate phi nodes, we
  1417. // need to make sure these intermediate phi nodes have different
  1418. // predecessors. To this end, we filter the special case where the exiting
  1419. // block is the latch block of the first loop. Nothing needs to be done
  1420. // anyway as all loop carried values dominate the latch and thereby also the
  1421. // exiting branch.
  1422. SmallVector<PHINode *, 8> OriginalFC0PHIs;
  1423. if (FC0.ExitingBlock != FC0.Latch)
  1424. for (PHINode &PHI : FC0.Header->phis())
  1425. OriginalFC0PHIs.push_back(&PHI);
  1426. // Replace incoming blocks for header PHIs first.
  1427. FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
  1428. FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
  1429. // Then modify the control flow and update DT and PDT.
  1430. SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
  1431. // The old exiting block of the first loop (FC0) has to jump to the header
  1432. // of the second as we need to execute the code in the second header block
  1433. // regardless of the trip count. That is, if the trip count is 0, so the
  1434. // back edge is never taken, we still have to execute both loop headers,
  1435. // especially (but not only!) if the second is a do-while style loop.
  1436. // However, doing so might invalidate the phi nodes of the first loop as
  1437. // the new values do only need to dominate their latch and not the exiting
  1438. // predicate. To remedy this potential problem we always introduce phi
  1439. // nodes in the header of the second loop later that select the loop carried
  1440. // value, if the second header was reached through an old latch of the
  1441. // first, or undef otherwise. This is sound as exiting the first implies the
  1442. // second will exit too, __without__ taking the back-edge. [Their
  1443. // trip-counts are equal after all.
  1444. // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
  1445. // to FC1.Header? I think this is basically what the three sequences are
  1446. // trying to accomplish; however, doing this directly in the CFG may mean
  1447. // the DT/PDT becomes invalid
  1448. if (!FC0.Peeled) {
  1449. FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
  1450. FC1.Header);
  1451. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1452. DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
  1453. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1454. DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
  1455. } else {
  1456. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1457. DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
  1458. // Remove the ExitBlock of the first Loop (also not needed)
  1459. FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
  1460. FC1.Header);
  1461. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1462. DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
  1463. FC0.ExitBlock->getTerminator()->eraseFromParent();
  1464. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1465. DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
  1466. new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
  1467. }
  1468. // The pre-header of L1 is not necessary anymore.
  1469. assert(pred_empty(FC1.Preheader));
  1470. FC1.Preheader->getTerminator()->eraseFromParent();
  1471. new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
  1472. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1473. DominatorTree::Delete, FC1.Preheader, FC1.Header));
  1474. // Moves the phi nodes from the second to the first loops header block.
  1475. while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
  1476. if (SE.isSCEVable(PHI->getType()))
  1477. SE.forgetValue(PHI);
  1478. if (PHI->hasNUsesOrMore(1))
  1479. PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
  1480. else
  1481. PHI->eraseFromParent();
  1482. }
  1483. // Introduce new phi nodes in the second loop header to ensure
  1484. // exiting the first and jumping to the header of the second does not break
  1485. // the SSA property of the phis originally in the first loop. See also the
  1486. // comment above.
  1487. Instruction *L1HeaderIP = &FC1.Header->front();
  1488. for (PHINode *LCPHI : OriginalFC0PHIs) {
  1489. int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
  1490. assert(L1LatchBBIdx >= 0 &&
  1491. "Expected loop carried value to be rewired at this point!");
  1492. Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
  1493. PHINode *L1HeaderPHI = PHINode::Create(
  1494. LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
  1495. L1HeaderPHI->addIncoming(LCV, FC0.Latch);
  1496. L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
  1497. FC0.ExitingBlock);
  1498. LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
  1499. }
  1500. // Replace latch terminator destinations.
  1501. FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
  1502. FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
  1503. // Modify the latch branch of FC0 to be unconditional as both successors of
  1504. // the branch are the same.
  1505. simplifyLatchBranch(FC0);
  1506. // If FC0.Latch and FC0.ExitingBlock are the same then we have already
  1507. // performed the updates above.
  1508. if (FC0.Latch != FC0.ExitingBlock)
  1509. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1510. DominatorTree::Insert, FC0.Latch, FC1.Header));
  1511. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
  1512. FC0.Latch, FC0.Header));
  1513. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
  1514. FC1.Latch, FC0.Header));
  1515. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
  1516. FC1.Latch, FC1.Header));
  1517. // Update DT/PDT
  1518. DTU.applyUpdates(TreeUpdates);
  1519. LI.removeBlock(FC1.Preheader);
  1520. DTU.deleteBB(FC1.Preheader);
  1521. if (FC0.Peeled) {
  1522. LI.removeBlock(FC0.ExitBlock);
  1523. DTU.deleteBB(FC0.ExitBlock);
  1524. }
  1525. DTU.flush();
  1526. // Is there a way to keep SE up-to-date so we don't need to forget the loops
  1527. // and rebuild the information in subsequent passes of fusion?
  1528. // Note: Need to forget the loops before merging the loop latches, as
  1529. // mergeLatch may remove the only block in FC1.
  1530. SE.forgetLoop(FC1.L);
  1531. SE.forgetLoop(FC0.L);
  1532. SE.forgetLoopDispositions();
  1533. // Move instructions from FC0.Latch to FC1.Latch.
  1534. // Note: mergeLatch requires an updated DT.
  1535. mergeLatch(FC0, FC1);
  1536. // Merge the loops.
  1537. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
  1538. for (BasicBlock *BB : Blocks) {
  1539. FC0.L->addBlockEntry(BB);
  1540. FC1.L->removeBlockFromLoop(BB);
  1541. if (LI.getLoopFor(BB) != FC1.L)
  1542. continue;
  1543. LI.changeLoopFor(BB, FC0.L);
  1544. }
  1545. while (!FC1.L->isInnermost()) {
  1546. const auto &ChildLoopIt = FC1.L->begin();
  1547. Loop *ChildLoop = *ChildLoopIt;
  1548. FC1.L->removeChildLoop(ChildLoopIt);
  1549. FC0.L->addChildLoop(ChildLoop);
  1550. }
  1551. // Delete the now empty loop L1.
  1552. LI.erase(FC1.L);
  1553. #ifndef NDEBUG
  1554. assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
  1555. assert(DT.verify(DominatorTree::VerificationLevel::Fast));
  1556. assert(PDT.verify());
  1557. LI.verify(DT);
  1558. SE.verify();
  1559. #endif
  1560. LLVM_DEBUG(dbgs() << "Fusion done:\n");
  1561. return FC0.L;
  1562. }
  1563. /// Report details on loop fusion opportunities.
  1564. ///
  1565. /// This template function can be used to report both successful and missed
  1566. /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
  1567. /// be one of:
  1568. /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
  1569. /// given two valid fusion candidates.
  1570. /// - OptimizationRemark to report successful fusion of two fusion
  1571. /// candidates.
  1572. /// The remarks will be printed using the form:
  1573. /// <path/filename>:<line number>:<column number>: [<function name>]:
  1574. /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
  1575. template <typename RemarkKind>
  1576. void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
  1577. llvm::Statistic &Stat) {
  1578. assert(FC0.Preheader && FC1.Preheader &&
  1579. "Expecting valid fusion candidates");
  1580. using namespace ore;
  1581. #if LLVM_ENABLE_STATS
  1582. ++Stat;
  1583. ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
  1584. FC0.Preheader)
  1585. << "[" << FC0.Preheader->getParent()->getName()
  1586. << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
  1587. << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
  1588. << ": " << Stat.getDesc());
  1589. #endif
  1590. }
  1591. /// Fuse two guarded fusion candidates, creating a new fused loop.
  1592. ///
  1593. /// Fusing guarded loops is handled much the same way as fusing non-guarded
  1594. /// loops. The rewiring of the CFG is slightly different though, because of
  1595. /// the presence of the guards around the loops and the exit blocks after the
  1596. /// loop body. As such, the new loop is rewired as follows:
  1597. /// 1. Keep the guard branch from FC0 and use the non-loop block target
  1598. /// from the FC1 guard branch.
  1599. /// 2. Remove the exit block from FC0 (this exit block should be empty
  1600. /// right now).
  1601. /// 3. Remove the guard branch for FC1
  1602. /// 4. Remove the preheader for FC1.
  1603. /// The exit block successor for the latch of FC0 is updated to be the header
  1604. /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
  1605. /// be the header of FC0, thus creating the fused loop.
  1606. Loop *fuseGuardedLoops(const FusionCandidate &FC0,
  1607. const FusionCandidate &FC1) {
  1608. assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
  1609. BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
  1610. BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
  1611. BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
  1612. BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
  1613. BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
  1614. // Move instructions from the exit block of FC0 to the beginning of the exit
  1615. // block of FC1, in the case that the FC0 loop has not been peeled. In the
  1616. // case that FC0 loop is peeled, then move the instructions of the successor
  1617. // of the FC0 Exit block to the beginning of the exit block of FC1.
  1618. moveInstructionsToTheBeginning(
  1619. (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
  1620. DT, PDT, DI);
  1621. // Move instructions from the guard block of FC1 to the end of the guard
  1622. // block of FC0.
  1623. moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
  1624. assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
  1625. SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
  1626. ////////////////////////////////////////////////////////////////////////////
  1627. // Update the Loop Guard
  1628. ////////////////////////////////////////////////////////////////////////////
  1629. // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
  1630. // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
  1631. // Thus, one path from the guard goes to the preheader for FC0 (and thus
  1632. // executes the new fused loop) and the other path goes to the NonLoopBlock
  1633. // for FC1 (where FC1 guard would have gone if FC1 was not executed).
  1634. FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
  1635. FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
  1636. BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
  1637. BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
  1638. // The guard of FC1 is not necessary anymore.
  1639. FC1.GuardBranch->eraseFromParent();
  1640. new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
  1641. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1642. DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
  1643. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1644. DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
  1645. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1646. DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
  1647. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1648. DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
  1649. if (FC0.Peeled) {
  1650. // Remove the Block after the ExitBlock of FC0
  1651. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1652. DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
  1653. FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
  1654. new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
  1655. FC0ExitBlockSuccessor);
  1656. }
  1657. assert(pred_empty(FC1GuardBlock) &&
  1658. "Expecting guard block to have no predecessors");
  1659. assert(succ_empty(FC1GuardBlock) &&
  1660. "Expecting guard block to have no successors");
  1661. // Remember the phi nodes originally in the header of FC0 in order to rewire
  1662. // them later. However, this is only necessary if the new loop carried
  1663. // values might not dominate the exiting branch. While we do not generally
  1664. // test if this is the case but simply insert intermediate phi nodes, we
  1665. // need to make sure these intermediate phi nodes have different
  1666. // predecessors. To this end, we filter the special case where the exiting
  1667. // block is the latch block of the first loop. Nothing needs to be done
  1668. // anyway as all loop carried values dominate the latch and thereby also the
  1669. // exiting branch.
  1670. // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
  1671. // (because the loops are rotated. Thus, nothing will ever be added to
  1672. // OriginalFC0PHIs.
  1673. SmallVector<PHINode *, 8> OriginalFC0PHIs;
  1674. if (FC0.ExitingBlock != FC0.Latch)
  1675. for (PHINode &PHI : FC0.Header->phis())
  1676. OriginalFC0PHIs.push_back(&PHI);
  1677. assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
  1678. // Replace incoming blocks for header PHIs first.
  1679. FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
  1680. FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
  1681. // The old exiting block of the first loop (FC0) has to jump to the header
  1682. // of the second as we need to execute the code in the second header block
  1683. // regardless of the trip count. That is, if the trip count is 0, so the
  1684. // back edge is never taken, we still have to execute both loop headers,
  1685. // especially (but not only!) if the second is a do-while style loop.
  1686. // However, doing so might invalidate the phi nodes of the first loop as
  1687. // the new values do only need to dominate their latch and not the exiting
  1688. // predicate. To remedy this potential problem we always introduce phi
  1689. // nodes in the header of the second loop later that select the loop carried
  1690. // value, if the second header was reached through an old latch of the
  1691. // first, or undef otherwise. This is sound as exiting the first implies the
  1692. // second will exit too, __without__ taking the back-edge (their
  1693. // trip-counts are equal after all).
  1694. FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
  1695. FC1.Header);
  1696. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1697. DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
  1698. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1699. DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
  1700. // Remove FC0 Exit Block
  1701. // The exit block for FC0 is no longer needed since control will flow
  1702. // directly to the header of FC1. Since it is an empty block, it can be
  1703. // removed at this point.
  1704. // TODO: In the future, we can handle non-empty exit blocks my merging any
  1705. // instructions from FC0 exit block into FC1 exit block prior to removing
  1706. // the block.
  1707. assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
  1708. FC0.ExitBlock->getTerminator()->eraseFromParent();
  1709. new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
  1710. // Remove FC1 Preheader
  1711. // The pre-header of L1 is not necessary anymore.
  1712. assert(pred_empty(FC1.Preheader));
  1713. FC1.Preheader->getTerminator()->eraseFromParent();
  1714. new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
  1715. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1716. DominatorTree::Delete, FC1.Preheader, FC1.Header));
  1717. // Moves the phi nodes from the second to the first loops header block.
  1718. while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
  1719. if (SE.isSCEVable(PHI->getType()))
  1720. SE.forgetValue(PHI);
  1721. if (PHI->hasNUsesOrMore(1))
  1722. PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
  1723. else
  1724. PHI->eraseFromParent();
  1725. }
  1726. // Introduce new phi nodes in the second loop header to ensure
  1727. // exiting the first and jumping to the header of the second does not break
  1728. // the SSA property of the phis originally in the first loop. See also the
  1729. // comment above.
  1730. Instruction *L1HeaderIP = &FC1.Header->front();
  1731. for (PHINode *LCPHI : OriginalFC0PHIs) {
  1732. int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
  1733. assert(L1LatchBBIdx >= 0 &&
  1734. "Expected loop carried value to be rewired at this point!");
  1735. Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
  1736. PHINode *L1HeaderPHI = PHINode::Create(
  1737. LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
  1738. L1HeaderPHI->addIncoming(LCV, FC0.Latch);
  1739. L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
  1740. FC0.ExitingBlock);
  1741. LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
  1742. }
  1743. // Update the latches
  1744. // Replace latch terminator destinations.
  1745. FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
  1746. FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
  1747. // Modify the latch branch of FC0 to be unconditional as both successors of
  1748. // the branch are the same.
  1749. simplifyLatchBranch(FC0);
  1750. // If FC0.Latch and FC0.ExitingBlock are the same then we have already
  1751. // performed the updates above.
  1752. if (FC0.Latch != FC0.ExitingBlock)
  1753. TreeUpdates.emplace_back(DominatorTree::UpdateType(
  1754. DominatorTree::Insert, FC0.Latch, FC1.Header));
  1755. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
  1756. FC0.Latch, FC0.Header));
  1757. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
  1758. FC1.Latch, FC0.Header));
  1759. TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
  1760. FC1.Latch, FC1.Header));
  1761. // All done
  1762. // Apply the updates to the Dominator Tree and cleanup.
  1763. assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
  1764. assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
  1765. // Update DT/PDT
  1766. DTU.applyUpdates(TreeUpdates);
  1767. LI.removeBlock(FC1GuardBlock);
  1768. LI.removeBlock(FC1.Preheader);
  1769. LI.removeBlock(FC0.ExitBlock);
  1770. if (FC0.Peeled) {
  1771. LI.removeBlock(FC0ExitBlockSuccessor);
  1772. DTU.deleteBB(FC0ExitBlockSuccessor);
  1773. }
  1774. DTU.deleteBB(FC1GuardBlock);
  1775. DTU.deleteBB(FC1.Preheader);
  1776. DTU.deleteBB(FC0.ExitBlock);
  1777. DTU.flush();
  1778. // Is there a way to keep SE up-to-date so we don't need to forget the loops
  1779. // and rebuild the information in subsequent passes of fusion?
  1780. // Note: Need to forget the loops before merging the loop latches, as
  1781. // mergeLatch may remove the only block in FC1.
  1782. SE.forgetLoop(FC1.L);
  1783. SE.forgetLoop(FC0.L);
  1784. SE.forgetLoopDispositions();
  1785. // Move instructions from FC0.Latch to FC1.Latch.
  1786. // Note: mergeLatch requires an updated DT.
  1787. mergeLatch(FC0, FC1);
  1788. // Merge the loops.
  1789. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
  1790. for (BasicBlock *BB : Blocks) {
  1791. FC0.L->addBlockEntry(BB);
  1792. FC1.L->removeBlockFromLoop(BB);
  1793. if (LI.getLoopFor(BB) != FC1.L)
  1794. continue;
  1795. LI.changeLoopFor(BB, FC0.L);
  1796. }
  1797. while (!FC1.L->isInnermost()) {
  1798. const auto &ChildLoopIt = FC1.L->begin();
  1799. Loop *ChildLoop = *ChildLoopIt;
  1800. FC1.L->removeChildLoop(ChildLoopIt);
  1801. FC0.L->addChildLoop(ChildLoop);
  1802. }
  1803. // Delete the now empty loop L1.
  1804. LI.erase(FC1.L);
  1805. #ifndef NDEBUG
  1806. assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
  1807. assert(DT.verify(DominatorTree::VerificationLevel::Fast));
  1808. assert(PDT.verify());
  1809. LI.verify(DT);
  1810. SE.verify();
  1811. #endif
  1812. LLVM_DEBUG(dbgs() << "Fusion done:\n");
  1813. return FC0.L;
  1814. }
  1815. };
  1816. struct LoopFuseLegacy : public FunctionPass {
  1817. static char ID;
  1818. LoopFuseLegacy() : FunctionPass(ID) {
  1819. initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
  1820. }
  1821. void getAnalysisUsage(AnalysisUsage &AU) const override {
  1822. AU.addRequiredID(LoopSimplifyID);
  1823. AU.addRequired<ScalarEvolutionWrapperPass>();
  1824. AU.addRequired<LoopInfoWrapperPass>();
  1825. AU.addRequired<DominatorTreeWrapperPass>();
  1826. AU.addRequired<PostDominatorTreeWrapperPass>();
  1827. AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  1828. AU.addRequired<DependenceAnalysisWrapperPass>();
  1829. AU.addRequired<AssumptionCacheTracker>();
  1830. AU.addRequired<TargetTransformInfoWrapperPass>();
  1831. AU.addPreserved<ScalarEvolutionWrapperPass>();
  1832. AU.addPreserved<LoopInfoWrapperPass>();
  1833. AU.addPreserved<DominatorTreeWrapperPass>();
  1834. AU.addPreserved<PostDominatorTreeWrapperPass>();
  1835. }
  1836. bool runOnFunction(Function &F) override {
  1837. if (skipFunction(F))
  1838. return false;
  1839. auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1840. auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1841. auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
  1842. auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1843. auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  1844. auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  1845. auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  1846. const TargetTransformInfo &TTI =
  1847. getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1848. const DataLayout &DL = F.getParent()->getDataLayout();
  1849. LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
  1850. return LF.fuseLoops(F);
  1851. }
  1852. };
  1853. } // namespace
  1854. PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
  1855. auto &LI = AM.getResult<LoopAnalysis>(F);
  1856. auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  1857. auto &DI = AM.getResult<DependenceAnalysis>(F);
  1858. auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  1859. auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
  1860. auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  1861. auto &AC = AM.getResult<AssumptionAnalysis>(F);
  1862. const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  1863. const DataLayout &DL = F.getParent()->getDataLayout();
  1864. // Ensure loops are in simplifed form which is a pre-requisite for loop fusion
  1865. // pass. Added only for new PM since the legacy PM has already added
  1866. // LoopSimplify pass as a dependency.
  1867. bool Changed = false;
  1868. for (auto &L : LI) {
  1869. Changed |=
  1870. simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
  1871. }
  1872. if (Changed)
  1873. PDT.recalculate(F);
  1874. LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
  1875. Changed |= LF.fuseLoops(F);
  1876. if (!Changed)
  1877. return PreservedAnalyses::all();
  1878. PreservedAnalyses PA;
  1879. PA.preserve<DominatorTreeAnalysis>();
  1880. PA.preserve<PostDominatorTreeAnalysis>();
  1881. PA.preserve<ScalarEvolutionAnalysis>();
  1882. PA.preserve<LoopAnalysis>();
  1883. return PA;
  1884. }
  1885. char LoopFuseLegacy::ID = 0;
  1886. INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
  1887. false)
  1888. INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
  1889. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  1890. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  1891. INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
  1892. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  1893. INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
  1894. INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  1895. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  1896. INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
  1897. FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }