SimpleLoopUnswitch.cpp 135 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263
  1. ///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
  9. #include "llvm/ADT/DenseMap.h"
  10. #include "llvm/ADT/STLExtras.h"
  11. #include "llvm/ADT/Sequence.h"
  12. #include "llvm/ADT/SetVector.h"
  13. #include "llvm/ADT/SmallPtrSet.h"
  14. #include "llvm/ADT/SmallVector.h"
  15. #include "llvm/ADT/Statistic.h"
  16. #include "llvm/ADT/Twine.h"
  17. #include "llvm/Analysis/AssumptionCache.h"
  18. #include "llvm/Analysis/CFG.h"
  19. #include "llvm/Analysis/CodeMetrics.h"
  20. #include "llvm/Analysis/GuardUtils.h"
  21. #include "llvm/Analysis/InstructionSimplify.h"
  22. #include "llvm/Analysis/LoopAnalysisManager.h"
  23. #include "llvm/Analysis/LoopInfo.h"
  24. #include "llvm/Analysis/LoopIterator.h"
  25. #include "llvm/Analysis/LoopPass.h"
  26. #include "llvm/Analysis/MemorySSA.h"
  27. #include "llvm/Analysis/MemorySSAUpdater.h"
  28. #include "llvm/Analysis/MustExecute.h"
  29. #include "llvm/Analysis/ScalarEvolution.h"
  30. #include "llvm/Analysis/ValueTracking.h"
  31. #include "llvm/IR/BasicBlock.h"
  32. #include "llvm/IR/Constant.h"
  33. #include "llvm/IR/Constants.h"
  34. #include "llvm/IR/Dominators.h"
  35. #include "llvm/IR/Function.h"
  36. #include "llvm/IR/IRBuilder.h"
  37. #include "llvm/IR/InstrTypes.h"
  38. #include "llvm/IR/Instruction.h"
  39. #include "llvm/IR/Instructions.h"
  40. #include "llvm/IR/IntrinsicInst.h"
  41. #include "llvm/IR/PatternMatch.h"
  42. #include "llvm/IR/Use.h"
  43. #include "llvm/IR/Value.h"
  44. #include "llvm/InitializePasses.h"
  45. #include "llvm/Pass.h"
  46. #include "llvm/Support/Casting.h"
  47. #include "llvm/Support/CommandLine.h"
  48. #include "llvm/Support/Debug.h"
  49. #include "llvm/Support/ErrorHandling.h"
  50. #include "llvm/Support/GenericDomTree.h"
  51. #include "llvm/Support/raw_ostream.h"
  52. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  53. #include "llvm/Transforms/Utils/Cloning.h"
  54. #include "llvm/Transforms/Utils/Local.h"
  55. #include "llvm/Transforms/Utils/LoopUtils.h"
  56. #include "llvm/Transforms/Utils/ValueMapper.h"
  57. #include <algorithm>
  58. #include <cassert>
  59. #include <iterator>
  60. #include <numeric>
  61. #include <utility>
  62. #define DEBUG_TYPE "simple-loop-unswitch"
  63. using namespace llvm;
  64. using namespace llvm::PatternMatch;
// Pass statistics, reported under -stats.
STATISTIC(NumBranches, "Number of branches unswitched");
STATISTIC(NumSwitches, "Number of switches unswitched");
STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
STATISTIC(NumTrivial, "Number of unswitches that are trivial");
STATISTIC(
    NumCostMultiplierSkipped,
    "Number of unswitch candidates that had their cost multiplier skipped");

// Overrides the NonTrivial flag passed to the pass constructor when set.
static cl::opt<bool> EnableNonTrivialUnswitch(
    "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
    cl::desc("Forcibly enables non-trivial loop unswitching rather than "
             "following the configuration passed into the pass."));

// Cost (in abstract instruction units) above which a candidate is rejected.
static cl::opt<int>
    UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                      cl::ZeroOrMore,
                      cl::desc("The cost threshold for unswitching a loop."));

// The cost-multiplier machinery damps exponential loop duplication when many
// candidates exist; the next two options tune it.
static cl::opt<bool> EnableUnswitchCostMultiplier(
    "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
    cl::desc("Enable unswitch cost multiplier that prohibits exponential "
             "explosion in nontrivial unswitch."));

static cl::opt<int> UnswitchSiblingsToplevelDiv(
    "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
    cl::desc("Toplevel siblings divisor for cost multiplier."));

static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
    "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
    cl::desc("Number of unswitch candidates that are ignored when calculating "
             "cost multiplier."));

// When set, llvm.experimental.guard intrinsics are turned into branches so
// they can be unswitched as well.
static cl::opt<bool> UnswitchGuards(
    "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
    cl::desc("If enabled, simple loop unswitching will also consider "
             "llvm.experimental.guard intrinsics as unswitch candidates."));

static cl::opt<bool> DropNonTrivialImplicitNullChecks(
    "simple-loop-unswitch-drop-non-trivial-implicit-null-checks",
    cl::init(false), cl::Hidden,
    cl::desc("If enabled, drop make.implicit metadata in unswitched implicit "
             "null checks to save time analyzing if we can keep it."));

// Caps the MemorySSA walk performed by the partial-unswitching analysis.
static cl::opt<unsigned>
    MSSAThreshold("simple-loop-unswitch-memoryssa-threshold",
                  cl::desc("Max number of memory uses to explore during "
                           "partial unswitching analysis"),
                  cl::init(100), cl::Hidden);

// Freezing the hoisted condition blocks poison/undef from being branched on
// twice with different results after duplication.
static cl::opt<bool> FreezeLoopUnswitchCond(
    "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden,
    cl::desc("If enabled, the freeze instruction will be added to condition "
             "of loop unswitch to prevent miscompilation."));
  109. /// Collect all of the loop invariant input values transitively used by the
  110. /// homogeneous instruction graph from a given root.
  111. ///
  112. /// This essentially walks from a root recursively through loop variant operands
  113. /// which have the exact same opcode and finds all inputs which are loop
  114. /// invariant. For some operations these can be re-associated and unswitched out
  115. /// of the loop entirely.
  116. static TinyPtrVector<Value *>
  117. collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
  118. LoopInfo &LI) {
  119. assert(!L.isLoopInvariant(&Root) &&
  120. "Only need to walk the graph if root itself is not invariant.");
  121. TinyPtrVector<Value *> Invariants;
  122. bool IsRootAnd = match(&Root, m_LogicalAnd());
  123. bool IsRootOr = match(&Root, m_LogicalOr());
  124. // Build a worklist and recurse through operators collecting invariants.
  125. SmallVector<Instruction *, 4> Worklist;
  126. SmallPtrSet<Instruction *, 8> Visited;
  127. Worklist.push_back(&Root);
  128. Visited.insert(&Root);
  129. do {
  130. Instruction &I = *Worklist.pop_back_val();
  131. for (Value *OpV : I.operand_values()) {
  132. // Skip constants as unswitching isn't interesting for them.
  133. if (isa<Constant>(OpV))
  134. continue;
  135. // Add it to our result if loop invariant.
  136. if (L.isLoopInvariant(OpV)) {
  137. Invariants.push_back(OpV);
  138. continue;
  139. }
  140. // If not an instruction with the same opcode, nothing we can do.
  141. Instruction *OpI = dyn_cast<Instruction>(OpV);
  142. if (OpI && ((IsRootAnd && match(OpI, m_LogicalAnd())) ||
  143. (IsRootOr && match(OpI, m_LogicalOr())))) {
  144. // Visit this operand.
  145. if (Visited.insert(OpI).second)
  146. Worklist.push_back(OpI);
  147. }
  148. }
  149. } while (!Worklist.empty());
  150. return Invariants;
  151. }
  152. static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
  153. Constant &Replacement) {
  154. assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
  155. // Replace uses of LIC in the loop with the given constant.
  156. // We use make_early_inc_range as set invalidates the iterator.
  157. for (Use &U : llvm::make_early_inc_range(Invariant->uses())) {
  158. Instruction *UserI = dyn_cast<Instruction>(U.getUser());
  159. // Replace this use within the loop body.
  160. if (UserI && L.contains(UserI))
  161. U.set(&Replacement);
  162. }
  163. }
  164. /// Check that all the LCSSA PHI nodes in the loop exit block have trivial
  165. /// incoming values along this edge.
  166. static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
  167. BasicBlock &ExitBB) {
  168. for (Instruction &I : ExitBB) {
  169. auto *PN = dyn_cast<PHINode>(&I);
  170. if (!PN)
  171. // No more PHIs to check.
  172. return true;
  173. // If the incoming value for this edge isn't loop invariant the unswitch
  174. // won't be trivial.
  175. if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
  176. return false;
  177. }
  178. llvm_unreachable("Basic blocks should never be empty!");
  179. }
  180. /// Copy a set of loop invariant values \p ToDuplicate and insert them at the
  181. /// end of \p BB and conditionally branch on the copied condition. We only
  182. /// branch on a single value.
  183. static void buildPartialUnswitchConditionalBranch(
  184. BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
  185. BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) {
  186. IRBuilder<> IRB(&BB);
  187. Value *Cond = Direction ? IRB.CreateOr(Invariants) :
  188. IRB.CreateAnd(Invariants);
  189. if (InsertFreeze)
  190. Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr");
  191. IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
  192. Direction ? &NormalSucc : &UnswitchedSucc);
  193. }
/// Copy a set of loop invariant values, and conditionally branch on them.
///
/// Clones each instruction in \p ToDuplicate to the end of \p BB (remapping
/// operands through the clones already made), mirrors the clones into
/// MemorySSA when \p MSSAU is non-null, and finally emits a conditional
/// branch on the clone of ToDuplicate[0].
static void buildPartialInvariantUnswitchConditionalBranch(
    BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,
    BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L,
    MemorySSAUpdater *MSSAU) {
  ValueToValueMapTy VMap;
  // Walk in reverse so each clone's operands (later entries) already have
  // mappings in VMap by the time RemapInstruction runs.
  for (auto *Val : reverse(ToDuplicate)) {
    Instruction *Inst = cast<Instruction>(Val);
    Instruction *NewInst = Inst->clone();
    BB.getInstList().insert(BB.end(), NewInst);
    RemapInstruction(NewInst, VMap,
                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
    VMap[Val] = NewInst;

    // Without a MemorySSA updater there is nothing further to maintain.
    if (!MSSAU)
      continue;

    // If the original instruction was a memory use, give the clone a
    // MemoryUse whose defining access lives outside the loop.
    MemorySSA *MSSA = MSSAU->getMemorySSA();
    if (auto *MemUse =
            dyn_cast_or_null<MemoryUse>(MSSA->getMemoryAccess(Inst))) {
      auto *DefiningAccess = MemUse->getDefiningAccess();
      // Get the first defining access before the loop.
      while (L.contains(DefiningAccess->getBlock())) {
        // If the defining access is a MemoryPhi, get the incoming
        // value for the pre-header as defining access.
        if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess))
          DefiningAccess =
              MemPhi->getIncomingValueForBlock(L.getLoopPreheader());
        else
          DefiningAccess = cast<MemoryDef>(DefiningAccess)->getDefiningAccess();
      }
      MSSAU->createMemoryAccessInBB(NewInst, DefiningAccess,
                                    NewInst->getParent(),
                                    MemorySSA::BeforeTerminator);
    }
  }

  // Branch on the cloned head of the duplicated chain; Direction selects
  // which successor the true edge targets.
  IRBuilder<> IRB(&BB);
  Value *Cond = VMap[ToDuplicate[0]];
  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                   Direction ? &NormalSucc : &UnswitchedSucc);
}
  233. /// Rewrite the PHI nodes in an unswitched loop exit basic block.
  234. ///
  235. /// Requires that the loop exit and unswitched basic block are the same, and
  236. /// that the exiting block was a unique predecessor of that block. Rewrites the
  237. /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
  238. /// PHI nodes from the old preheader that now contains the unswitched
  239. /// terminator.
  240. static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
  241. BasicBlock &OldExitingBB,
  242. BasicBlock &OldPH) {
  243. for (PHINode &PN : UnswitchedBB.phis()) {
  244. // When the loop exit is directly unswitched we just need to update the
  245. // incoming basic block. We loop to handle weird cases with repeated
  246. // incoming blocks, but expect to typically only have one operand here.
  247. for (auto i : seq<int>(0, PN.getNumOperands())) {
  248. assert(PN.getIncomingBlock(i) == &OldExitingBB &&
  249. "Found incoming block different from unique predecessor!");
  250. PN.setIncomingBlock(i, &OldPH);
  251. }
  252. }
  253. }
  254. /// Rewrite the PHI nodes in the loop exit basic block and the split off
  255. /// unswitched block.
  256. ///
  257. /// Because the exit block remains an exit from the loop, this rewrites the
  258. /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
  259. /// nodes into the unswitched basic block to select between the value in the
  260. /// old preheader and the loop exit.
  261. static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
  262. BasicBlock &UnswitchedBB,
  263. BasicBlock &OldExitingBB,
  264. BasicBlock &OldPH,
  265. bool FullUnswitch) {
  266. assert(&ExitBB != &UnswitchedBB &&
  267. "Must have different loop exit and unswitched blocks!");
  268. Instruction *InsertPt = &*UnswitchedBB.begin();
  269. for (PHINode &PN : ExitBB.phis()) {
  270. auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
  271. PN.getName() + ".split", InsertPt);
  272. // Walk backwards over the old PHI node's inputs to minimize the cost of
  273. // removing each one. We have to do this weird loop manually so that we
  274. // create the same number of new incoming edges in the new PHI as we expect
  275. // each case-based edge to be included in the unswitched switch in some
  276. // cases.
  277. // FIXME: This is really, really gross. It would be much cleaner if LLVM
  278. // allowed us to create a single entry for a predecessor block without
  279. // having separate entries for each "edge" even though these edges are
  280. // required to produce identical results.
  281. for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
  282. if (PN.getIncomingBlock(i) != &OldExitingBB)
  283. continue;
  284. Value *Incoming = PN.getIncomingValue(i);
  285. if (FullUnswitch)
  286. // No more edge from the old exiting block to the exit block.
  287. PN.removeIncomingValue(i);
  288. NewPN->addIncoming(Incoming, &OldPH);
  289. }
  290. // Now replace the old PHI with the new one and wire the old one in as an
  291. // input to the new one.
  292. PN.replaceAllUsesWith(NewPN);
  293. NewPN->addIncoming(&PN, &ExitBB);
  294. }
  295. }
/// Hoist the current loop up to the innermost loop containing a remaining exit.
///
/// Because we've removed an exit from the loop, we may have changed the set of
/// loops reachable and need to move the current loop up the loop nest or even
/// to an entirely separate nest.
static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
                                 DominatorTree &DT, LoopInfo &LI,
                                 MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
  // If the loop is already at the top level, we can't hoist it anywhere.
  Loop *OldParentL = L.getParentLoop();
  if (!OldParentL)
    return;

  // Find the innermost loop that still contains one of L's exit blocks; that
  // loop (possibly none, i.e. top level) is the new parent.
  SmallVector<BasicBlock *, 4> Exits;
  L.getExitBlocks(Exits);
  Loop *NewParentL = nullptr;
  for (auto *ExitBB : Exits)
    if (Loop *ExitL = LI.getLoopFor(ExitBB))
      if (!NewParentL || NewParentL->contains(ExitL))
        NewParentL = ExitL;

  // Nothing to do if the parent is unchanged.
  if (NewParentL == OldParentL)
    return;

  // The new parent loop (if different) should always contain the old one.
  if (NewParentL)
    assert(NewParentL->contains(OldParentL) &&
           "Can only hoist this loop up the nest!");

  // The preheader will need to move with the body of this loop. However,
  // because it isn't in this loop we also need to update the primary loop map.
  assert(OldParentL == LI.getLoopFor(&Preheader) &&
         "Parent loop of this loop should contain this loop's preheader!");
  LI.changeLoopFor(&Preheader, NewParentL);

  // Remove this loop from its old parent.
  OldParentL->removeChildLoop(&L);

  // Add the loop either to the new parent or as a top-level loop.
  if (NewParentL)
    NewParentL->addChildLoop(&L);
  else
    LI.addTopLevelLoop(&L);

  // Remove this loops blocks from the old parent and every other loop up the
  // nest until reaching the new parent. Also update all of these
  // no-longer-containing loops to reflect the nesting change.
  for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
       OldContainingL = OldContainingL->getParentLoop()) {
    // Drop L's blocks (and its moved preheader) from both the block vector
    // and the block set of each loop we are leaving.
    llvm::erase_if(OldContainingL->getBlocksVector(),
                   [&](const BasicBlock *BB) {
                     return BB == &Preheader || L.contains(BB);
                   });
    OldContainingL->getBlocksSet().erase(&Preheader);
    for (BasicBlock *BB : L.blocks())
      OldContainingL->getBlocksSet().erase(BB);

    // Because we just hoisted a loop out of this one, we have essentially
    // created new exit paths from it. That means we need to form LCSSA PHI
    // nodes for values used in the no-longer-nested loop.
    formLCSSA(*OldContainingL, DT, &LI, SE);

    // We shouldn't need to form dedicated exits because the exit introduced
    // here is the (just split by unswitching) preheader. However, after trivial
    // unswitching it is possible to get new non-dedicated exits out of parent
    // loop so let's conservatively form dedicated exit blocks and figure out
    // if we can optimize later.
    formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU,
                            /*PreserveLCSSA*/ true);
  }
}
  358. // Return the top-most loop containing ExitBB and having ExitBB as exiting block
  359. // or the loop containing ExitBB, if there is no parent loop containing ExitBB
  360. // as exiting block.
  361. static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
  362. Loop *TopMost = LI.getLoopFor(ExitBB);
  363. Loop *Current = TopMost;
  364. while (Current) {
  365. if (Current->isLoopExiting(ExitBB))
  366. TopMost = Current;
  367. Current = Current->getParentLoop();
  368. }
  369. return TopMost;
  370. }
  371. /// Unswitch a trivial branch if the condition is loop invariant.
  372. ///
  373. /// This routine should only be called when loop code leading to the branch has
  374. /// been validated as trivial (no side effects). This routine checks if the
  375. /// condition is invariant and one of the successors is a loop exit. This
  376. /// allows us to unswitch without duplicating the loop, making it trivial.
  377. ///
  378. /// If this routine fails to unswitch the branch it returns false.
  379. ///
  380. /// If the branch can be unswitched, this routine splits the preheader and
  381. /// hoists the branch above that split. Preserves loop simplified form
  382. /// (splitting the exit block as necessary). It simplifies the branch within
  383. /// the loop to an unconditional branch but doesn't remove it entirely. Further
  384. /// cleanup can be done with some simplifycfg like pass.
  385. ///
  386. /// If `SE` is not null, it will be updated based on the potential loop SCEVs
  387. /// invalidated by this.
  388. static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
  389. LoopInfo &LI, ScalarEvolution *SE,
  390. MemorySSAUpdater *MSSAU) {
  391. assert(BI.isConditional() && "Can only unswitch a conditional branch!");
  392. LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
  393. // The loop invariant values that we want to unswitch.
  394. TinyPtrVector<Value *> Invariants;
  395. // When true, we're fully unswitching the branch rather than just unswitching
  396. // some input conditions to the branch.
  397. bool FullUnswitch = false;
  398. if (L.isLoopInvariant(BI.getCondition())) {
  399. Invariants.push_back(BI.getCondition());
  400. FullUnswitch = true;
  401. } else {
  402. if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
  403. Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
  404. if (Invariants.empty()) {
  405. LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n");
  406. return false;
  407. }
  408. }
  409. // Check that one of the branch's successors exits, and which one.
  410. bool ExitDirection = true;
  411. int LoopExitSuccIdx = 0;
  412. auto *LoopExitBB = BI.getSuccessor(0);
  413. if (L.contains(LoopExitBB)) {
  414. ExitDirection = false;
  415. LoopExitSuccIdx = 1;
  416. LoopExitBB = BI.getSuccessor(1);
  417. if (L.contains(LoopExitBB)) {
  418. LLVM_DEBUG(dbgs() << " Branch doesn't exit the loop!\n");
  419. return false;
  420. }
  421. }
  422. auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
  423. auto *ParentBB = BI.getParent();
  424. if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) {
  425. LLVM_DEBUG(dbgs() << " Loop exit PHI's aren't loop-invariant!\n");
  426. return false;
  427. }
  428. // When unswitching only part of the branch's condition, we need the exit
  429. // block to be reached directly from the partially unswitched input. This can
  430. // be done when the exit block is along the true edge and the branch condition
  431. // is a graph of `or` operations, or the exit block is along the false edge
  432. // and the condition is a graph of `and` operations.
  433. if (!FullUnswitch) {
  434. if (ExitDirection ? !match(BI.getCondition(), m_LogicalOr())
  435. : !match(BI.getCondition(), m_LogicalAnd())) {
  436. LLVM_DEBUG(dbgs() << " Branch condition is in improper form for "
  437. "non-full unswitch!\n");
  438. return false;
  439. }
  440. }
  441. LLVM_DEBUG({
  442. dbgs() << " unswitching trivial invariant conditions for: " << BI
  443. << "\n";
  444. for (Value *Invariant : Invariants) {
  445. dbgs() << " " << *Invariant << " == true";
  446. if (Invariant != Invariants.back())
  447. dbgs() << " ||";
  448. dbgs() << "\n";
  449. }
  450. });
  451. // If we have scalar evolutions, we need to invalidate them including this
  452. // loop, the loop containing the exit block and the topmost parent loop
  453. // exiting via LoopExitBB.
  454. if (SE) {
  455. if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
  456. SE->forgetLoop(ExitL);
  457. else
  458. // Forget the entire nest as this exits the entire nest.
  459. SE->forgetTopmostLoop(&L);
  460. }
  461. if (MSSAU && VerifyMemorySSA)
  462. MSSAU->getMemorySSA()->verifyMemorySSA();
  463. // Split the preheader, so that we know that there is a safe place to insert
  464. // the conditional branch. We will change the preheader to have a conditional
  465. // branch on LoopCond.
  466. BasicBlock *OldPH = L.getLoopPreheader();
  467. BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
  468. // Now that we have a place to insert the conditional branch, create a place
  469. // to branch to: this is the exit block out of the loop that we are
  470. // unswitching. We need to split this if there are other loop predecessors.
  471. // Because the loop is in simplified form, *any* other predecessor is enough.
  472. BasicBlock *UnswitchedBB;
  473. if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
  474. assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
  475. "A branch's parent isn't a predecessor!");
  476. UnswitchedBB = LoopExitBB;
  477. } else {
  478. UnswitchedBB =
  479. SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
  480. }
  481. if (MSSAU && VerifyMemorySSA)
  482. MSSAU->getMemorySSA()->verifyMemorySSA();
  483. // Actually move the invariant uses into the unswitched position. If possible,
  484. // we do this by moving the instructions, but when doing partial unswitching
  485. // we do it by building a new merge of the values in the unswitched position.
  486. OldPH->getTerminator()->eraseFromParent();
  487. if (FullUnswitch) {
  488. // If fully unswitching, we can use the existing branch instruction.
  489. // Splice it into the old PH to gate reaching the new preheader and re-point
  490. // its successors.
  491. OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
  492. BI);
  493. if (MSSAU) {
  494. // Temporarily clone the terminator, to make MSSA update cheaper by
  495. // separating "insert edge" updates from "remove edge" ones.
  496. ParentBB->getInstList().push_back(BI.clone());
  497. } else {
  498. // Create a new unconditional branch that will continue the loop as a new
  499. // terminator.
  500. BranchInst::Create(ContinueBB, ParentBB);
  501. }
  502. BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
  503. BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
  504. } else {
  505. // Only unswitching a subset of inputs to the condition, so we will need to
  506. // build a new branch that merges the invariant inputs.
  507. if (ExitDirection)
  508. assert(match(BI.getCondition(), m_LogicalOr()) &&
  509. "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the "
  510. "condition!");
  511. else
  512. assert(match(BI.getCondition(), m_LogicalAnd()) &&
  513. "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the"
  514. " condition!");
  515. buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
  516. *UnswitchedBB, *NewPH, false);
  517. }
  518. // Update the dominator tree with the added edge.
  519. DT.insertEdge(OldPH, UnswitchedBB);
  520. // After the dominator tree was updated with the added edge, update MemorySSA
  521. // if available.
  522. if (MSSAU) {
  523. SmallVector<CFGUpdate, 1> Updates;
  524. Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
  525. MSSAU->applyInsertUpdates(Updates, DT);
  526. }
  527. // Finish updating dominator tree and memory ssa for full unswitch.
  528. if (FullUnswitch) {
  529. if (MSSAU) {
  530. // Remove the cloned branch instruction.
  531. ParentBB->getTerminator()->eraseFromParent();
  532. // Create unconditional branch now.
  533. BranchInst::Create(ContinueBB, ParentBB);
  534. MSSAU->removeEdge(ParentBB, LoopExitBB);
  535. }
  536. DT.deleteEdge(ParentBB, LoopExitBB);
  537. }
  538. if (MSSAU && VerifyMemorySSA)
  539. MSSAU->getMemorySSA()->verifyMemorySSA();
  540. // Rewrite the relevant PHI nodes.
  541. if (UnswitchedBB == LoopExitBB)
  542. rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
  543. else
  544. rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
  545. *ParentBB, *OldPH, FullUnswitch);
  546. // The constant we can replace all of our invariants with inside the loop
  547. // body. If any of the invariants have a value other than this the loop won't
  548. // be entered.
  549. ConstantInt *Replacement = ExitDirection
  550. ? ConstantInt::getFalse(BI.getContext())
  551. : ConstantInt::getTrue(BI.getContext());
  552. // Since this is an i1 condition we can also trivially replace uses of it
  553. // within the loop with a constant.
  554. for (Value *Invariant : Invariants)
  555. replaceLoopInvariantUses(L, Invariant, *Replacement);
  556. // If this was full unswitching, we may have changed the nesting relationship
  557. // for this loop so hoist it to its correct parent if needed.
  558. if (FullUnswitch)
  559. hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
  560. if (MSSAU && VerifyMemorySSA)
  561. MSSAU->getMemorySSA()->verifyMemorySSA();
  562. LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n");
  563. ++NumTrivial;
  564. ++NumBranches;
  565. return true;
  566. }
  567. /// Unswitch a trivial switch if the condition is loop invariant.
  568. ///
  569. /// This routine should only be called when loop code leading to the switch has
  570. /// been validated as trivial (no side effects). This routine checks if the
  571. /// condition is invariant and that at least one of the successors is a loop
  572. /// exit. This allows us to unswitch without duplicating the loop, making it
  573. /// trivial.
  574. ///
  575. /// If this routine fails to unswitch the switch it returns false.
  576. ///
  577. /// If the switch can be unswitched, this routine splits the preheader and
  578. /// copies the switch above that split. If the default case is one of the
  579. /// exiting cases, it copies the non-exiting cases and points them at the new
  580. /// preheader. If the default case is not exiting, it copies the exiting cases
  581. /// and points the default at the preheader. It preserves loop simplified form
  582. /// (splitting the exit blocks as necessary). It simplifies the switch within
  583. /// the loop by removing now-dead cases. If the default case is one of those
  584. /// unswitched, it replaces its destination with a new basic block containing
  585. /// only unreachable. Such basic blocks, while technically loop exits, are not
  586. /// considered for unswitching so this is a stable transform and the same
  587. /// switch will not be revisited. If after unswitching there is only a single
  588. /// in-loop successor, the switch is further simplified to an unconditional
  589. /// branch. Still more cleanup can be done with some simplifycfg like pass.
  590. ///
  591. /// If `SE` is not null, it will be updated based on the potential loop SCEVs
  592. /// invalidated by this.
static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
                                  LoopInfo &LI, ScalarEvolution *SE,
                                  MemorySSAUpdater *MSSAU) {
  LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
  Value *LoopCond = SI.getCondition();

  // If this isn't switching on an invariant condition, we can't unswitch it.
  if (!L.isLoopInvariant(LoopCond))
    return false;

  auto *ParentBB = SI.getParent();

  // The same check must be used both for the default and the exit cases. We
  // should never leave edges from the switch instruction to a basic block that
  // we are unswitching, hence the condition used to determine the default case
  // needs to also be used to populate ExitCaseIndices, which is then used to
  // remove cases from the switch.
  auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
    // BBToCheck is not an exit block if it is inside loop L.
    if (L.contains(&BBToCheck))
      return false;
    // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
    if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
      return false;
    // We do not unswitch a block that only has an unreachable statement, as
    // it's possible this is a previously unswitched block. Only unswitch if
    // either the terminator is not unreachable, or, if it is, it's not the only
    // instruction in the block.
    auto *TI = BBToCheck.getTerminator();
    bool isUnreachable = isa<UnreachableInst>(TI);
    return !isUnreachable ||
           (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
  };

  SmallVector<int, 4> ExitCaseIndices;
  for (auto Case : SI.cases())
    if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
      ExitCaseIndices.push_back(Case.getCaseIndex());
  BasicBlock *DefaultExitBB = nullptr;
  // Successor index 0 of a switch is the default destination's weight.
  SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
      SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
  if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
    DefaultExitBB = SI.getDefaultDest();
  } else if (ExitCaseIndices.empty())
    return false;

  LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n");

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  // We may need to invalidate SCEVs for the outermost loop reached by any of
  // the exits.
  Loop *OuterL = &L;

  if (DefaultExitBB) {
    // Clear out the default destination temporarily to allow accurate
    // predecessor lists to be examined below.
    SI.setDefaultDest(nullptr);
    // Check the loop containing this exit.
    Loop *ExitL = LI.getLoopFor(DefaultExitBB);
    // A null ExitL means the exit is outside every loop: widen OuterL to
    // "entire nest" (represented as nullptr).
    if (!ExitL || ExitL->contains(OuterL))
      OuterL = ExitL;
  }

  // Store the exit cases into a separate data structure and remove them from
  // the switch.
  SmallVector<std::tuple<ConstantInt *, BasicBlock *,
                         SwitchInstProfUpdateWrapper::CaseWeightOpt>,
              4> ExitCases;
  ExitCases.reserve(ExitCaseIndices.size());
  SwitchInstProfUpdateWrapper SIW(SI);
  // We walk the case indices backwards so that we remove the last case first
  // and don't disrupt the earlier indices.
  for (unsigned Index : reverse(ExitCaseIndices)) {
    auto CaseI = SI.case_begin() + Index;
    // Compute the outer loop from this exit.
    Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
    if (!ExitL || ExitL->contains(OuterL))
      OuterL = ExitL;
    // Save the value of this case.
    auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
    ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
    // Delete the unswitched cases.
    SIW.removeCase(CaseI);
  }

  if (SE) {
    if (OuterL)
      SE->forgetLoop(OuterL);
    else
      SE->forgetTopmostLoop(&L);
  }

  // Check if after this all of the remaining cases point at the same
  // successor.
  BasicBlock *CommonSuccBB = nullptr;
  if (SI.getNumCases() > 0 &&
      all_of(drop_begin(SI.cases()), [&SI](const SwitchInst::CaseHandle &Case) {
        return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor();
      }))
    CommonSuccBB = SI.case_begin()->getCaseSuccessor();
  if (!DefaultExitBB) {
    // If we're not unswitching the default, we need it to match any cases to
    // have a common successor or if we have no cases it is the common
    // successor.
    if (SI.getNumCases() == 0)
      CommonSuccBB = SI.getDefaultDest();
    else if (SI.getDefaultDest() != CommonSuccBB)
      CommonSuccBB = nullptr;
  }

  // Split the preheader, so that we know that there is a safe place to insert
  // the switch.
  BasicBlock *OldPH = L.getLoopPreheader();
  BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
  OldPH->getTerminator()->eraseFromParent();

  // Now add the unswitched switch. Its default initially targets the loop
  // (NewPH); exit cases are re-added below.
  auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
  SwitchInstProfUpdateWrapper NewSIW(*NewSI);

  // Rewrite the IR for the unswitched basic blocks. This requires two steps.
  // First, we split any exit blocks with remaining in-loop predecessors. Then
  // we update the PHIs in one of two ways depending on if there was a split.
  // We walk in reverse so that we split in the same order as the cases
  // appeared. This is purely for convenience of reading the resulting IR, but
  // it doesn't cost anything really.
  SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
  SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
  // Handle the default exit if necessary.
  // FIXME: It'd be great if we could merge this with the loop below but LLVM's
  // ranges aren't quite powerful enough yet.
  if (DefaultExitBB) {
    // pred_empty is accurate here because the default dest was nulled above.
    if (pred_empty(DefaultExitBB)) {
      UnswitchedExitBBs.insert(DefaultExitBB);
      rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
    } else {
      auto *SplitBB =
          SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
      rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
                                                *ParentBB, *OldPH,
                                                /*FullUnswitch*/ true);
      DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
    }
  }
  // Note that we must use a reference in the for loop so that we update the
  // container.
  for (auto &ExitCase : reverse(ExitCases)) {
    // Grab a reference to the exit block in the pair so that we can update it.
    BasicBlock *ExitBB = std::get<1>(ExitCase);

    // If this case is the last edge into the exit block, we can simply reuse it
    // as it will no longer be a loop exit. No mapping necessary.
    if (pred_empty(ExitBB)) {
      // Only rewrite once.
      if (UnswitchedExitBBs.insert(ExitBB).second)
        rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
      continue;
    }

    // Otherwise we need to split the exit block so that we retain an exit
    // block from the loop and a target for the unswitched condition.
    BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
    if (!SplitExitBB) {
      // If this is the first time we see this, do the split and remember it.
      SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
      rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
                                                *ParentBB, *OldPH,
                                                /*FullUnswitch*/ true);
    }
    // Update the case pair to point to the split block.
    std::get<1>(ExitCase) = SplitExitBB;
  }

  // Now add the unswitched cases. We do this in reverse order as we built them
  // in reverse order.
  for (auto &ExitCase : reverse(ExitCases)) {
    ConstantInt *CaseVal = std::get<0>(ExitCase);
    BasicBlock *UnswitchedBB = std::get<1>(ExitCase);

    NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase));
  }

  // If the default was unswitched, re-point it and add explicit cases for
  // entering the loop.
  if (DefaultExitBB) {
    NewSIW->setDefaultDest(DefaultExitBB);
    NewSIW.setSuccessorWeight(0, DefaultCaseWeight);
    // We removed all the exit cases, so we just copy the cases to the
    // unswitched switch.
    for (const auto &Case : SI.cases())
      NewSIW.addCase(Case.getCaseValue(), NewPH,
                     SIW.getSuccessorWeight(Case.getSuccessorIndex()));
  } else if (DefaultCaseWeight) {
    // We have to set branch weight of the default case.
    uint64_t SW = *DefaultCaseWeight;
    for (const auto &Case : SI.cases()) {
      auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex());
      assert(W &&
             "case weight must be defined as default case weight is defined");
      SW += *W;
    }
    NewSIW.setSuccessorWeight(0, SW);
  }

  // If we ended up with a common successor for every path through the switch
  // after unswitching, rewrite it to an unconditional branch to make it easy
  // to recognize. Otherwise we potentially have to recognize the default case
  // pointing at unreachable and other complexity.
  if (CommonSuccBB) {
    BasicBlock *BB = SI.getParent();
    // We may have had multiple edges to this common successor block, so remove
    // them as predecessors. We skip the first one, either the default or the
    // actual first case.
    bool SkippedFirst = DefaultExitBB == nullptr;
    for (auto Case : SI.cases()) {
      assert(Case.getCaseSuccessor() == CommonSuccBB &&
             "Non-common successor!");
      (void)Case;
      if (!SkippedFirst) {
        SkippedFirst = true;
        continue;
      }
      CommonSuccBB->removePredecessor(BB,
                                      /*KeepOneInputPHIs*/ true);
    }
    // Now nuke the switch and replace it with a direct branch.
    SIW.eraseFromParent();
    BranchInst::Create(CommonSuccBB, BB);
  } else if (DefaultExitBB) {
    assert(SI.getNumCases() > 0 &&
           "If we had no cases we'd have a common successor!");
    // Move the last case to the default successor. This is valid as if the
    // default got unswitched it cannot be reached. This has the advantage of
    // being simple and keeping the number of edges from this switch to
    // successors the same, and avoiding any PHI update complexity.
    auto LastCaseI = std::prev(SI.case_end());

    SI.setDefaultDest(LastCaseI->getCaseSuccessor());
    SIW.setSuccessorWeight(
        0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex()));
    SIW.removeCase(LastCaseI);
  }

  // Walk the unswitched exit blocks and the unswitched split blocks and update
  // the dominator tree based on the CFG edits. While we are walking unordered
  // containers here, the API for applyUpdates takes an unordered list of
  // updates and requires them to not contain duplicates.
  SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
  for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
    DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
    DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
  }
  for (auto SplitUnswitchedPair : SplitExitBBMap) {
    DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
    DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
  }

  if (MSSAU) {
    MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
    if (VerifyMemorySSA)
      MSSAU->getMemorySSA()->verifyMemorySSA();
  } else {
    DT.applyUpdates(DTUpdates);
  }

  assert(DT.verify(DominatorTree::VerificationLevel::Fast));

  // We may have changed the nesting relationship for this loop so hoist it to
  // its correct parent if needed.
  hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  ++NumTrivial;
  ++NumSwitches;
  LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n");
  return true;
}
  847. /// This routine scans the loop to find a branch or switch which occurs before
  848. /// any side effects occur. These can potentially be unswitched without
  849. /// duplicating the loop. If a branch or switch is successfully unswitched the
  850. /// scanning continues to see if subsequent branches or switches have become
  851. /// trivial. Once all trivial candidates have been unswitched, this routine
  852. /// returns.
  853. ///
  854. /// The return value indicates whether anything was unswitched (and therefore
  855. /// changed).
  856. ///
  857. /// If `SE` is not null, it will be updated based on the potential loop SCEVs
  858. /// invalidated by this.
  859. static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
  860. LoopInfo &LI, ScalarEvolution *SE,
  861. MemorySSAUpdater *MSSAU) {
  862. bool Changed = false;
  863. // If loop header has only one reachable successor we should keep looking for
  864. // trivial condition candidates in the successor as well. An alternative is
  865. // to constant fold conditions and merge successors into loop header (then we
  866. // only need to check header's terminator). The reason for not doing this in
  867. // LoopUnswitch pass is that it could potentially break LoopPassManager's
  868. // invariants. Folding dead branches could either eliminate the current loop
  869. // or make other loops unreachable. LCSSA form might also not be preserved
  870. // after deleting branches. The following code keeps traversing loop header's
  871. // successors until it finds the trivial condition candidate (condition that
  872. // is not a constant). Since unswitching generates branches with constant
  873. // conditions, this scenario could be very common in practice.
  874. BasicBlock *CurrentBB = L.getHeader();
  875. SmallPtrSet<BasicBlock *, 8> Visited;
  876. Visited.insert(CurrentBB);
  877. do {
  878. // Check if there are any side-effecting instructions (e.g. stores, calls,
  879. // volatile loads) in the part of the loop that the code *would* execute
  880. // without unswitching.
  881. if (MSSAU) // Possible early exit with MSSA
  882. if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB))
  883. if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end()))
  884. return Changed;
  885. if (llvm::any_of(*CurrentBB,
  886. [](Instruction &I) { return I.mayHaveSideEffects(); }))
  887. return Changed;
  888. Instruction *CurrentTerm = CurrentBB->getTerminator();
  889. if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
  890. // Don't bother trying to unswitch past a switch with a constant
  891. // condition. This should be removed prior to running this pass by
  892. // simplifycfg.
  893. if (isa<Constant>(SI->getCondition()))
  894. return Changed;
  895. if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
  896. // Couldn't unswitch this one so we're done.
  897. return Changed;
  898. // Mark that we managed to unswitch something.
  899. Changed = true;
  900. // If unswitching turned the terminator into an unconditional branch then
  901. // we can continue. The unswitching logic specifically works to fold any
  902. // cases it can into an unconditional branch to make it easier to
  903. // recognize here.
  904. auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
  905. if (!BI || BI->isConditional())
  906. return Changed;
  907. CurrentBB = BI->getSuccessor(0);
  908. continue;
  909. }
  910. auto *BI = dyn_cast<BranchInst>(CurrentTerm);
  911. if (!BI)
  912. // We do not understand other terminator instructions.
  913. return Changed;
  914. // Don't bother trying to unswitch past an unconditional branch or a branch
  915. // with a constant value. These should be removed by simplifycfg prior to
  916. // running this pass.
  917. if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
  918. return Changed;
  919. // Found a trivial condition candidate: non-foldable conditional branch. If
  920. // we fail to unswitch this, we can't do anything else that is trivial.
  921. if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
  922. return Changed;
  923. // Mark that we managed to unswitch something.
  924. Changed = true;
  925. // If we only unswitched some of the conditions feeding the branch, we won't
  926. // have collapsed it to a single successor.
  927. BI = cast<BranchInst>(CurrentBB->getTerminator());
  928. if (BI->isConditional())
  929. return Changed;
  930. // Follow the newly unconditional branch into its successor.
  931. CurrentBB = BI->getSuccessor(0);
  932. // When continuing, if we exit the loop or reach a previous visited block,
  933. // then we can not reach any trivial condition candidates (unfoldable
  934. // branch instructions or switch instructions) and no unswitch can happen.
  935. } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
  936. return Changed;
  937. }
  938. /// Build the cloned blocks for an unswitched copy of the given loop.
  939. ///
  940. /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
  941. /// after the split block (`SplitBB`) that will be used to select between the
  942. /// cloned and original loop.
  943. ///
  944. /// This routine handles cloning all of the necessary loop blocks and exit
  945. /// blocks including rewriting their instructions and the relevant PHI nodes.
  946. /// Any loop blocks or exit blocks which are dominated by a different successor
  947. /// than the one for this clone of the loop blocks can be trivially skipped. We
  948. /// use the `DominatingSucc` map to determine whether a block satisfies that
  949. /// property with a simple map lookup.
  950. ///
  951. /// It also correctly creates the unconditional branch in the cloned
  952. /// unswitched parent block to only point at the unswitched successor.
  953. ///
  954. /// This does not handle most of the necessary updates to `LoopInfo`. Only exit
  955. /// block splitting is correctly reflected in `LoopInfo`, essentially all of
  956. /// the cloned blocks (and their loops) are left without full `LoopInfo`
  957. /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
  958. /// blocks to them but doesn't create the cloned `DominatorTree` structure and
  959. /// instead the caller must recompute an accurate DT. It *does* correctly
  960. /// update the `AssumptionCache` provided in `AC`.
  961. static BasicBlock *buildClonedLoopBlocks(
  962. Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
  963. ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
  964. BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
  965. const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
  966. ValueToValueMapTy &VMap,
  967. SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
  968. DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
  969. SmallVector<BasicBlock *, 4> NewBlocks;
  970. NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
  971. // We will need to clone a bunch of blocks, wrap up the clone operation in
  972. // a helper.
  973. auto CloneBlock = [&](BasicBlock *OldBB) {
  974. // Clone the basic block and insert it before the new preheader.
  975. BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
  976. NewBB->moveBefore(LoopPH);
  977. // Record this block and the mapping.
  978. NewBlocks.push_back(NewBB);
  979. VMap[OldBB] = NewBB;
  980. return NewBB;
  981. };
  982. // We skip cloning blocks when they have a dominating succ that is not the
  983. // succ we are cloning for.
  984. auto SkipBlock = [&](BasicBlock *BB) {
  985. auto It = DominatingSucc.find(BB);
  986. return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
  987. };
  988. // First, clone the preheader.
  989. auto *ClonedPH = CloneBlock(LoopPH);
  990. // Then clone all the loop blocks, skipping the ones that aren't necessary.
  991. for (auto *LoopBB : L.blocks())
  992. if (!SkipBlock(LoopBB))
  993. CloneBlock(LoopBB);
  994. // Split all the loop exit edges so that when we clone the exit blocks, if
  995. // any of the exit blocks are *also* a preheader for some other loop, we
  996. // don't create multiple predecessors entering the loop header.
  997. for (auto *ExitBB : ExitBlocks) {
  998. if (SkipBlock(ExitBB))
  999. continue;
  1000. // When we are going to clone an exit, we don't need to clone all the
  1001. // instructions in the exit block and we want to ensure we have an easy
  1002. // place to merge the CFG, so split the exit first. This is always safe to
  1003. // do because there cannot be any non-loop predecessors of a loop exit in
  1004. // loop simplified form.
  1005. auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
  1006. // Rearrange the names to make it easier to write test cases by having the
  1007. // exit block carry the suffix rather than the merge block carrying the
  1008. // suffix.
  1009. MergeBB->takeName(ExitBB);
  1010. ExitBB->setName(Twine(MergeBB->getName()) + ".split");
  1011. // Now clone the original exit block.
  1012. auto *ClonedExitBB = CloneBlock(ExitBB);
  1013. assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
  1014. "Exit block should have been split to have one successor!");
  1015. assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
  1016. "Cloned exit block has the wrong successor!");
  1017. // Remap any cloned instructions and create a merge phi node for them.
  1018. for (auto ZippedInsts : llvm::zip_first(
  1019. llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
  1020. llvm::make_range(ClonedExitBB->begin(),
  1021. std::prev(ClonedExitBB->end())))) {
  1022. Instruction &I = std::get<0>(ZippedInsts);
  1023. Instruction &ClonedI = std::get<1>(ZippedInsts);
  1024. // The only instructions in the exit block should be PHI nodes and
  1025. // potentially a landing pad.
  1026. assert(
  1027. (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
  1028. "Bad instruction in exit block!");
  1029. // We should have a value map between the instruction and its clone.
  1030. assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
  1031. auto *MergePN =
  1032. PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
  1033. &*MergeBB->getFirstInsertionPt());
  1034. I.replaceAllUsesWith(MergePN);
  1035. MergePN->addIncoming(&I, ExitBB);
  1036. MergePN->addIncoming(&ClonedI, ClonedExitBB);
  1037. }
  1038. }
  1039. // Rewrite the instructions in the cloned blocks to refer to the instructions
  1040. // in the cloned blocks. We have to do this as a second pass so that we have
  1041. // everything available. Also, we have inserted new instructions which may
  1042. // include assume intrinsics, so we update the assumption cache while
  1043. // processing this.
  1044. for (auto *ClonedBB : NewBlocks)
  1045. for (Instruction &I : *ClonedBB) {
  1046. RemapInstruction(&I, VMap,
  1047. RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
  1048. if (auto *II = dyn_cast<AssumeInst>(&I))
  1049. AC.registerAssumption(II);
  1050. }
  1051. // Update any PHI nodes in the cloned successors of the skipped blocks to not
  1052. // have spurious incoming values.
  1053. for (auto *LoopBB : L.blocks())
  1054. if (SkipBlock(LoopBB))
  1055. for (auto *SuccBB : successors(LoopBB))
  1056. if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
  1057. for (PHINode &PN : ClonedSuccBB->phis())
  1058. PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
  1059. // Remove the cloned parent as a predecessor of any successor we ended up
  1060. // cloning other than the unswitched one.
  1061. auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
  1062. for (auto *SuccBB : successors(ParentBB)) {
  1063. if (SuccBB == UnswitchedSuccBB)
  1064. continue;
  1065. auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
  1066. if (!ClonedSuccBB)
  1067. continue;
  1068. ClonedSuccBB->removePredecessor(ClonedParentBB,
  1069. /*KeepOneInputPHIs*/ true);
  1070. }
  1071. // Replace the cloned branch with an unconditional branch to the cloned
  1072. // unswitched successor.
  1073. auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
  1074. Instruction *ClonedTerminator = ClonedParentBB->getTerminator();
  1075. // Trivial Simplification. If Terminator is a conditional branch and
  1076. // condition becomes dead - erase it.
  1077. Value *ClonedConditionToErase = nullptr;
  1078. if (auto *BI = dyn_cast<BranchInst>(ClonedTerminator))
  1079. ClonedConditionToErase = BI->getCondition();
  1080. else if (auto *SI = dyn_cast<SwitchInst>(ClonedTerminator))
  1081. ClonedConditionToErase = SI->getCondition();
  1082. ClonedTerminator->eraseFromParent();
  1083. BranchInst::Create(ClonedSuccBB, ClonedParentBB);
  1084. if (ClonedConditionToErase)
  1085. RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr,
  1086. MSSAU);
  1087. // If there are duplicate entries in the PHI nodes because of multiple edges
  1088. // to the unswitched successor, we need to nuke all but one as we replaced it
  1089. // with a direct branch.
  1090. for (PHINode &PN : ClonedSuccBB->phis()) {
  1091. bool Found = false;
  1092. // Loop over the incoming operands backwards so we can easily delete as we
  1093. // go without invalidating the index.
  1094. for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
  1095. if (PN.getIncomingBlock(i) != ClonedParentBB)
  1096. continue;
  1097. if (!Found) {
  1098. Found = true;
  1099. continue;
  1100. }
  1101. PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
  1102. }
  1103. }
  1104. // Record the domtree updates for the new blocks.
  1105. SmallPtrSet<BasicBlock *, 4> SuccSet;
  1106. for (auto *ClonedBB : NewBlocks) {
  1107. for (auto *SuccBB : successors(ClonedBB))
  1108. if (SuccSet.insert(SuccBB).second)
  1109. DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
  1110. SuccSet.clear();
  1111. }
  1112. return ClonedPH;
  1113. }
  1114. /// Recursively clone the specified loop and all of its children.
  1115. ///
  1116. /// The target parent loop for the clone should be provided, or can be null if
  1117. /// the clone is a top-level loop. While cloning, all the blocks are mapped
  1118. /// with the provided value map. The entire original loop must be present in
  1119. /// the value map. The cloned loop is returned.
  1120. static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
  1121. const ValueToValueMapTy &VMap, LoopInfo &LI) {
  1122. auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
  1123. assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
  1124. ClonedL.reserveBlocks(OrigL.getNumBlocks());
  1125. for (auto *BB : OrigL.blocks()) {
  1126. auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
  1127. ClonedL.addBlockEntry(ClonedBB);
  1128. if (LI.getLoopFor(BB) == &OrigL)
  1129. LI.changeLoopFor(ClonedBB, &ClonedL);
  1130. }
  1131. };
  1132. // We specially handle the first loop because it may get cloned into
  1133. // a different parent and because we most commonly are cloning leaf loops.
  1134. Loop *ClonedRootL = LI.AllocateLoop();
  1135. if (RootParentL)
  1136. RootParentL->addChildLoop(ClonedRootL);
  1137. else
  1138. LI.addTopLevelLoop(ClonedRootL);
  1139. AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
  1140. if (OrigRootL.isInnermost())
  1141. return ClonedRootL;
  1142. // If we have a nest, we can quickly clone the entire loop nest using an
  1143. // iterative approach because it is a tree. We keep the cloned parent in the
  1144. // data structure to avoid repeatedly querying through a map to find it.
  1145. SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
  1146. // Build up the loops to clone in reverse order as we'll clone them from the
  1147. // back.
  1148. for (Loop *ChildL : llvm::reverse(OrigRootL))
  1149. LoopsToClone.push_back({ClonedRootL, ChildL});
  1150. do {
  1151. Loop *ClonedParentL, *L;
  1152. std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
  1153. Loop *ClonedL = LI.AllocateLoop();
  1154. ClonedParentL->addChildLoop(ClonedL);
  1155. AddClonedBlocksToLoop(*L, *ClonedL);
  1156. for (Loop *ChildL : llvm::reverse(*L))
  1157. LoopsToClone.push_back({ClonedL, ChildL});
  1158. } while (!LoopsToClone.empty());
  1159. return ClonedRootL;
  1160. }
/// Build the cloned loops of an original loop from unswitching.
///
/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
/// operation. We need to re-verify that there even is a loop (as the backedge
/// may not have been cloned), and even if there are remaining backedges the
/// backedge set may be different. However, we know that each child loop is
/// undisturbed, we only need to find where to place each child loop within
/// either any parent loop or within a cloned version of the original loop.
///
/// Because child loops may end up cloned outside of any cloned version of the
/// original loop, multiple cloned sibling loops may be created. All of them
/// are returned so that the newly introduced loop nest roots can be
/// identified.
///
/// \param OrigL               The loop that was cloned.
/// \param ExitBlocks          The original loop's exit blocks.
/// \param VMap                Map from original values/blocks to their clones;
///                            blocks absent from this map were not cloned.
/// \param LI                  LoopInfo to update with the new loop structures.
/// \param NonChildClonedLoops Out-param collecting every cloned loop that is
///                            not a child of another cloned loop.
static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
                             const ValueToValueMapTy &VMap, LoopInfo &LI,
                             SmallVectorImpl<Loop *> &NonChildClonedLoops) {
  // The cloned counterpart of OrigL itself; stays null if cloning removed all
  // backedges and no loop survives.
  Loop *ClonedL = nullptr;

  auto *OrigPH = OrigL.getLoopPreheader();
  auto *OrigHeader = OrigL.getHeader();

  auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
  auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));

  // We need to know the loops of the cloned exit blocks to even compute the
  // accurate parent loop. If we only clone exits to some parent of the
  // original parent, we want to clone into that outer loop. We also keep track
  // of the loops that our cloned exit blocks participate in.
  Loop *ParentL = nullptr;
  SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
  SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
  ClonedExitsInLoops.reserve(ExitBlocks.size());
  for (auto *ExitBB : ExitBlocks)
    if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
      if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
        ExitLoopMap[ClonedExitBB] = ExitL;
        ClonedExitsInLoops.push_back(ClonedExitBB);
        // Track the innermost loop containing all cloned exits: prefer ExitL
        // whenever the current candidate (strictly) contains it.
        if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
          ParentL = ExitL;
      }
  assert((!ParentL || ParentL == OrigL.getParentLoop() ||
          ParentL->contains(OrigL.getParentLoop())) &&
         "The computed parent loop should always contain (or be) the parent of "
         "the original loop.");

  // We build the set of blocks dominated by the cloned header from the set of
  // cloned blocks out of the original loop. While not all of these will
  // necessarily be in the cloned loop, it is enough to establish that they
  // aren't in unreachable cycles, etc.
  SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
  for (auto *BB : OrigL.blocks())
    if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
      ClonedLoopBlocks.insert(ClonedBB);

  // Rebuild the set of blocks that will end up in the cloned loop. We may have
  // skipped cloning some region of this loop which can in turn skip some of
  // the backedges so we have to rebuild the blocks in the loop based on the
  // backedges that remain after cloning.
  SmallVector<BasicBlock *, 16> Worklist;
  SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
  for (auto *Pred : predecessors(ClonedHeader)) {
    // The only possible non-loop header predecessor is the preheader because
    // we know we cloned the loop in simplified form.
    if (Pred == ClonedPH)
      continue;

    // Because the loop was in simplified form, the only non-loop predecessor
    // should be the preheader.
    assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
                                           "header other than the preheader "
                                           "that is not part of the loop!");

    // Insert this block into the loop set and on the first visit (and if it
    // isn't the header we're currently walking) put it into the worklist to
    // recurse through.
    if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
      Worklist.push_back(Pred);
  }

  // If we had any backedges then there *is* a cloned loop. Put the header into
  // the loop set and then walk the worklist backwards to find all the blocks
  // that remain within the loop after cloning.
  if (!BlocksInClonedLoop.empty()) {
    BlocksInClonedLoop.insert(ClonedHeader);

    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      assert(BlocksInClonedLoop.count(BB) &&
             "Didn't put block into the loop set!");

      // Insert any predecessors that are in the possible set into the cloned
      // set, and if the insert is successful, add them to the worklist. Note
      // that we filter on the blocks that are definitely reachable via the
      // backedge to the loop header so we may prune out dead code within the
      // cloned loop.
      for (auto *Pred : predecessors(BB))
        if (ClonedLoopBlocks.count(Pred) &&
            BlocksInClonedLoop.insert(Pred).second)
          Worklist.push_back(Pred);
    }

    ClonedL = LI.AllocateLoop();
    if (ParentL) {
      // The cloned preheader lives in the parent loop (it is outside the
      // cloned loop itself), and the cloned loop becomes a child of it.
      ParentL->addBasicBlockToLoop(ClonedPH, LI);
      ParentL->addChildLoop(ClonedL);
    } else {
      LI.addTopLevelLoop(ClonedL);
    }
    NonChildClonedLoops.push_back(ClonedL);

    ClonedL->reserveBlocks(BlocksInClonedLoop.size());
    // We don't want to just add the cloned loop blocks based on how we
    // discovered them. The original order of blocks was carefully built in
    // a way that doesn't rely on predecessor ordering. Rather than re-invent
    // that logic, we just re-walk the original blocks (and those of the child
    // loops) and filter them as we add them into the cloned loop.
    for (auto *BB : OrigL.blocks()) {
      auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
      if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
        continue;

      // Directly add the blocks that are only in this loop.
      if (LI.getLoopFor(BB) == &OrigL) {
        ClonedL->addBasicBlockToLoop(ClonedBB, LI);
        continue;
      }

      // We want to manually add it to this loop and parents.
      // Registering it with LoopInfo will happen when we clone the top
      // loop for this block.
      for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
        PL->addBlockEntry(ClonedBB);
    }

    // Now add each child loop whose header remains within the cloned loop. All
    // of the blocks within the loop must satisfy the same constraints as the
    // header so once we pass the header checks we can just clone the entire
    // child loop nest.
    for (Loop *ChildL : OrigL) {
      auto *ClonedChildHeader =
          cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
      if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
        continue;

#ifndef NDEBUG
      // We should never have a cloned child loop header but fail to have
      // all of the blocks for that child loop.
      for (auto *ChildLoopBB : ChildL->blocks())
        assert(BlocksInClonedLoop.count(
                   cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
               "Child cloned loop has a header within the cloned outer "
               "loop but not all of its blocks!");
#endif

      cloneLoopNest(*ChildL, ClonedL, VMap, LI);
    }
  }

  // Now that we've handled all the components of the original loop that were
  // cloned into a new loop, we still need to handle anything from the original
  // loop that wasn't in a cloned loop.

  // Figure out what blocks are left to place within any loop nest containing
  // the unswitched loop. If we never formed a loop, the cloned PH is one of
  // them.
  SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
  if (BlocksInClonedLoop.empty())
    UnloopedBlockSet.insert(ClonedPH);
  for (auto *ClonedBB : ClonedLoopBlocks)
    if (!BlocksInClonedLoop.count(ClonedBB))
      UnloopedBlockSet.insert(ClonedBB);

  // Copy the cloned exits and sort them in ascending loop depth, we'll work
  // backwards across these to process them inside out. The order shouldn't
  // matter as we're just trying to build up the map from inside-out; we use
  // the map in a more stably ordered way below.
  auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
  llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
    return ExitLoopMap.lookup(LHS)->getLoopDepth() <
           ExitLoopMap.lookup(RHS)->getLoopDepth();
  });

  // Populate the existing ExitLoopMap with everything reachable from each
  // exit, starting from the inner most exit.
  while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
    assert(Worklist.empty() && "Didn't clear worklist!");

    BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
    Loop *ExitL = ExitLoopMap.lookup(ExitBB);

    // Walk the CFG back until we hit the cloned PH adding everything reachable
    // and in the unlooped set to this exit block's loop.
    Worklist.push_back(ExitBB);
    do {
      BasicBlock *BB = Worklist.pop_back_val();
      // We can stop recursing at the cloned preheader (if we get there).
      if (BB == ClonedPH)
        continue;

      for (BasicBlock *PredBB : predecessors(BB)) {
        // If this pred has already been moved to our set or is part of some
        // (inner) loop, no update needed.
        if (!UnloopedBlockSet.erase(PredBB)) {
          assert(
              (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
              "Predecessor not mapped to a loop!");
          continue;
        }

        // We just insert into the loop set here. We'll add these blocks to the
        // exit loop after we build up the set in an order that doesn't rely on
        // predecessor order (which in turn relies on use list order).
        bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
        (void)Inserted;
        assert(Inserted && "Should only visit an unlooped block once!");

        // And recurse through to its predecessors.
        Worklist.push_back(PredBB);
      }
    } while (!Worklist.empty());
  }

  // Now that the ExitLoopMap gives as mapping for all the non-looping cloned
  // blocks to their outer loops, walk the cloned blocks and the cloned exits
  // in their original order adding them to the correct loop.

  // We need a stable insertion order. We use the order of the original loop
  // order and map into the correct parent loop.
  for (auto *BB : llvm::concat<BasicBlock *const>(
           makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
    if (Loop *OuterL = ExitLoopMap.lookup(BB))
      OuterL->addBasicBlockToLoop(BB, LI);

#ifndef NDEBUG
  for (auto &BBAndL : ExitLoopMap) {
    auto *BB = BBAndL.first;
    auto *OuterL = BBAndL.second;
    assert(LI.getLoopFor(BB) == OuterL &&
           "Failed to put all blocks into outer loops!");
  }
#endif

  // Now that all the blocks are placed into the correct containing loop in the
  // absence of child loops, find all the potentially cloned child loops and
  // clone them into whatever outer loop we placed their header into.
  for (Loop *ChildL : OrigL) {
    auto *ClonedChildHeader =
        cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
    if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
      continue;

#ifndef NDEBUG
    for (auto *ChildLoopBB : ChildL->blocks())
      assert(VMap.count(ChildLoopBB) &&
             "Cloned a child loop header but not all of that loops blocks!");
#endif

    NonChildClonedLoops.push_back(cloneLoopNest(
        *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
  }
}
  1390. static void
  1391. deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
  1392. ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
  1393. DominatorTree &DT, MemorySSAUpdater *MSSAU) {
  1394. // Find all the dead clones, and remove them from their successors.
  1395. SmallVector<BasicBlock *, 16> DeadBlocks;
  1396. for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
  1397. for (auto &VMap : VMaps)
  1398. if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
  1399. if (!DT.isReachableFromEntry(ClonedBB)) {
  1400. for (BasicBlock *SuccBB : successors(ClonedBB))
  1401. SuccBB->removePredecessor(ClonedBB);
  1402. DeadBlocks.push_back(ClonedBB);
  1403. }
  1404. // Remove all MemorySSA in the dead blocks
  1405. if (MSSAU) {
  1406. SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(),
  1407. DeadBlocks.end());
  1408. MSSAU->removeBlocks(DeadBlockSet);
  1409. }
  1410. // Drop any remaining references to break cycles.
  1411. for (BasicBlock *BB : DeadBlocks)
  1412. BB->dropAllReferences();
  1413. // Erase them from the IR.
  1414. for (BasicBlock *BB : DeadBlocks)
  1415. BB->eraseFromParent();
  1416. }
  1417. static void
  1418. deleteDeadBlocksFromLoop(Loop &L,
  1419. SmallVectorImpl<BasicBlock *> &ExitBlocks,
  1420. DominatorTree &DT, LoopInfo &LI,
  1421. MemorySSAUpdater *MSSAU,
  1422. function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
  1423. // Find all the dead blocks tied to this loop, and remove them from their
  1424. // successors.
  1425. SmallSetVector<BasicBlock *, 8> DeadBlockSet;
  1426. // Start with loop/exit blocks and get a transitive closure of reachable dead
  1427. // blocks.
  1428. SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(),
  1429. ExitBlocks.end());
  1430. DeathCandidates.append(L.blocks().begin(), L.blocks().end());
  1431. while (!DeathCandidates.empty()) {
  1432. auto *BB = DeathCandidates.pop_back_val();
  1433. if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) {
  1434. for (BasicBlock *SuccBB : successors(BB)) {
  1435. SuccBB->removePredecessor(BB);
  1436. DeathCandidates.push_back(SuccBB);
  1437. }
  1438. DeadBlockSet.insert(BB);
  1439. }
  1440. }
  1441. // Remove all MemorySSA in the dead blocks
  1442. if (MSSAU)
  1443. MSSAU->removeBlocks(DeadBlockSet);
  1444. // Filter out the dead blocks from the exit blocks list so that it can be
  1445. // used in the caller.
  1446. llvm::erase_if(ExitBlocks,
  1447. [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
  1448. // Walk from this loop up through its parents removing all of the dead blocks.
  1449. for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
  1450. for (auto *BB : DeadBlockSet)
  1451. ParentL->getBlocksSet().erase(BB);
  1452. llvm::erase_if(ParentL->getBlocksVector(),
  1453. [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
  1454. }
  1455. // Now delete the dead child loops. This raw delete will clear them
  1456. // recursively.
  1457. llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
  1458. if (!DeadBlockSet.count(ChildL->getHeader()))
  1459. return false;
  1460. assert(llvm::all_of(ChildL->blocks(),
  1461. [&](BasicBlock *ChildBB) {
  1462. return DeadBlockSet.count(ChildBB);
  1463. }) &&
  1464. "If the child loop header is dead all blocks in the child loop must "
  1465. "be dead as well!");
  1466. DestroyLoopCB(*ChildL, ChildL->getName());
  1467. LI.destroy(ChildL);
  1468. return true;
  1469. });
  1470. // Remove the loop mappings for the dead blocks and drop all the references
  1471. // from these blocks to others to handle cyclic references as we start
  1472. // deleting the blocks themselves.
  1473. for (auto *BB : DeadBlockSet) {
  1474. // Check that the dominator tree has already been updated.
  1475. assert(!DT.getNode(BB) && "Should already have cleared domtree!");
  1476. LI.changeLoopFor(BB, nullptr);
  1477. // Drop all uses of the instructions to make sure we won't have dangling
  1478. // uses in other blocks.
  1479. for (auto &I : *BB)
  1480. if (!I.use_empty())
  1481. I.replaceAllUsesWith(UndefValue::get(I.getType()));
  1482. BB->dropAllReferences();
  1483. }
  1484. // Actually delete the blocks now that they've been fully unhooked from the
  1485. // IR.
  1486. for (auto *BB : DeadBlockSet)
  1487. BB->eraseFromParent();
  1488. }
/// Recompute the set of blocks in a loop after unswitching.
///
/// This walks from the original headers predecessors to rebuild the loop. We
/// take advantage of the fact that new blocks can't have been added, and so we
/// filter by the original loop's blocks. This also handles potentially
/// unreachable code that we don't want to explore but might be found examining
/// the predecessors of the header.
///
/// If the original loop is no longer a loop, this will return an empty set. If
/// it remains a loop, all the blocks within it will be added to the set
/// (including those blocks in inner loops).
///
/// \param L  The (possibly no-longer-a-)loop to recompute; must still be in
///           loop-simplified form (single preheader).
/// \param LI LoopInfo, used to jump across intact inner loops in one step.
static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
                                                                 LoopInfo &LI) {
  SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
  auto *PH = L.getLoopPreheader();
  auto *Header = L.getHeader();

  // A worklist to use while walking backwards from the header.
  SmallVector<BasicBlock *, 16> Worklist;

  // First walk the predecessors of the header to find the backedges. This will
  // form the basis of our walk.
  for (auto *Pred : predecessors(Header)) {
    // Skip the preheader.
    if (Pred == PH)
      continue;

    // Because the loop was in simplified form, the only non-loop predecessor
    // is the preheader.
    assert(L.contains(Pred) && "Found a predecessor of the loop header other "
                               "than the preheader that is not part of the "
                               "loop!");

    // Insert this block into the loop set and on the first visit and, if it
    // isn't the header we're currently walking, put it into the worklist to
    // recurse through.
    if (LoopBlockSet.insert(Pred).second && Pred != Header)
      Worklist.push_back(Pred);
  }

  // If no backedges were found, we're done.
  if (LoopBlockSet.empty())
    return LoopBlockSet;

  // We found backedges, recurse through them to identify the loop blocks.
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");

    // No need to walk past the header.
    if (BB == Header)
      continue;

    // Because we know the inner loop structure remains valid we can use the
    // loop structure to jump immediately across the entire nested loop.
    // Further, because it is in loop simplified form, we can directly jump
    // to its preheader afterward.
    if (Loop *InnerL = LI.getLoopFor(BB))
      if (InnerL != &L) {
        assert(L.contains(InnerL) &&
               "Should not reach a loop *outside* this loop!");
        // The preheader is the only possible predecessor of the loop so
        // insert it into the set and check whether it was already handled.
        auto *InnerPH = InnerL->getLoopPreheader();
        assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
                                      "but not contain the inner loop "
                                      "preheader!");
        if (!LoopBlockSet.insert(InnerPH).second)
          // The only way to reach the preheader is through the loop body
          // itself so if it has been visited the loop is already handled.
          continue;

        // Insert all of the blocks (other than those already present) into
        // the loop set. We expect at least the block that led us to find the
        // inner loop to be in the block set, but we may also have other loop
        // blocks if they were already enqueued as predecessors of some other
        // outer loop block.
        for (auto *InnerBB : InnerL->blocks()) {
          if (InnerBB == BB) {
            assert(LoopBlockSet.count(InnerBB) &&
                   "Block should already be in the set!");
            continue;
          }

          LoopBlockSet.insert(InnerBB);
        }

        // Add the preheader to the worklist so we will continue past the
        // loop body.
        Worklist.push_back(InnerPH);
        continue;
      }

    // Insert any predecessors that were in the original loop into the new
    // set, and if the insert is successful, add them to the worklist.
    for (auto *Pred : predecessors(BB))
      if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
        Worklist.push_back(Pred);
  }

  assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");

  // We've found all the blocks participating in the loop, return our completed
  // set.
  return LoopBlockSet;
}
  1581. /// Rebuild a loop after unswitching removes some subset of blocks and edges.
  1582. ///
  1583. /// The removal may have removed some child loops entirely but cannot have
  1584. /// disturbed any remaining child loops. However, they may need to be hoisted
  1585. /// to the parent loop (or to be top-level loops). The original loop may be
  1586. /// completely removed.
  1587. ///
  1588. /// The sibling loops resulting from this update are returned. If the original
  1589. /// loop remains a valid loop, it will be the first entry in this list with all
  1590. /// of the newly sibling loops following it.
  1591. ///
  1592. /// Returns true if the loop remains a loop after unswitching, and false if it
  1593. /// is no longer a loop after unswitching (and should not continue to be
  1594. /// referenced).
  1595. static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
  1596. LoopInfo &LI,
  1597. SmallVectorImpl<Loop *> &HoistedLoops) {
  1598. auto *PH = L.getLoopPreheader();
  1599. // Compute the actual parent loop from the exit blocks. Because we may have
  1600. // pruned some exits the loop may be different from the original parent.
  1601. Loop *ParentL = nullptr;
  1602. SmallVector<Loop *, 4> ExitLoops;
  1603. SmallVector<BasicBlock *, 4> ExitsInLoops;
  1604. ExitsInLoops.reserve(ExitBlocks.size());
  1605. for (auto *ExitBB : ExitBlocks)
  1606. if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
  1607. ExitLoops.push_back(ExitL);
  1608. ExitsInLoops.push_back(ExitBB);
  1609. if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
  1610. ParentL = ExitL;
  1611. }
  1612. // Recompute the blocks participating in this loop. This may be empty if it
  1613. // is no longer a loop.
  1614. auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
  1615. // If we still have a loop, we need to re-set the loop's parent as the exit
  1616. // block set changing may have moved it within the loop nest. Note that this
  1617. // can only happen when this loop has a parent as it can only hoist the loop
  1618. // *up* the nest.
  1619. if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
  1620. // Remove this loop's (original) blocks from all of the intervening loops.
  1621. for (Loop *IL = L.getParentLoop(); IL != ParentL;
  1622. IL = IL->getParentLoop()) {
  1623. IL->getBlocksSet().erase(PH);
  1624. for (auto *BB : L.blocks())
  1625. IL->getBlocksSet().erase(BB);
  1626. llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
  1627. return BB == PH || L.contains(BB);
  1628. });
  1629. }
  1630. LI.changeLoopFor(PH, ParentL);
  1631. L.getParentLoop()->removeChildLoop(&L);
  1632. if (ParentL)
  1633. ParentL->addChildLoop(&L);
  1634. else
  1635. LI.addTopLevelLoop(&L);
  1636. }
  1637. // Now we update all the blocks which are no longer within the loop.
  1638. auto &Blocks = L.getBlocksVector();
  1639. auto BlocksSplitI =
  1640. LoopBlockSet.empty()
  1641. ? Blocks.begin()
  1642. : std::stable_partition(
  1643. Blocks.begin(), Blocks.end(),
  1644. [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
  1645. // Before we erase the list of unlooped blocks, build a set of them.
  1646. SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
  1647. if (LoopBlockSet.empty())
  1648. UnloopedBlocks.insert(PH);
  1649. // Now erase these blocks from the loop.
  1650. for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
  1651. L.getBlocksSet().erase(BB);
  1652. Blocks.erase(BlocksSplitI, Blocks.end());
  1653. // Sort the exits in ascending loop depth, we'll work backwards across these
  1654. // to process them inside out.
  1655. llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
  1656. return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
  1657. });
  1658. // We'll build up a set for each exit loop.
  1659. SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
  1660. Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
  1661. auto RemoveUnloopedBlocksFromLoop =
  1662. [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
  1663. for (auto *BB : UnloopedBlocks)
  1664. L.getBlocksSet().erase(BB);
  1665. llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
  1666. return UnloopedBlocks.count(BB);
  1667. });
  1668. };
  1669. SmallVector<BasicBlock *, 16> Worklist;
  1670. while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
  1671. assert(Worklist.empty() && "Didn't clear worklist!");
  1672. assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
  1673. // Grab the next exit block, in decreasing loop depth order.
  1674. BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
  1675. Loop &ExitL = *LI.getLoopFor(ExitBB);
  1676. assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
  1677. // Erase all of the unlooped blocks from the loops between the previous
  1678. // exit loop and this exit loop. This works because the ExitInLoops list is
  1679. // sorted in increasing order of loop depth and thus we visit loops in
  1680. // decreasing order of loop depth.
  1681. for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
  1682. RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
  1683. // Walk the CFG back until we hit the cloned PH adding everything reachable
  1684. // and in the unlooped set to this exit block's loop.
  1685. Worklist.push_back(ExitBB);
  1686. do {
  1687. BasicBlock *BB = Worklist.pop_back_val();
  1688. // We can stop recursing at the cloned preheader (if we get there).
  1689. if (BB == PH)
  1690. continue;
  1691. for (BasicBlock *PredBB : predecessors(BB)) {
  1692. // If this pred has already been moved to our set or is part of some
  1693. // (inner) loop, no update needed.
  1694. if (!UnloopedBlocks.erase(PredBB)) {
  1695. assert((NewExitLoopBlocks.count(PredBB) ||
  1696. ExitL.contains(LI.getLoopFor(PredBB))) &&
  1697. "Predecessor not in a nested loop (or already visited)!");
  1698. continue;
  1699. }
  1700. // We just insert into the loop set here. We'll add these blocks to the
  1701. // exit loop after we build up the set in a deterministic order rather
  1702. // than the predecessor-influenced visit order.
  1703. bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
  1704. (void)Inserted;
  1705. assert(Inserted && "Should only visit an unlooped block once!");
  1706. // And recurse through to its predecessors.
  1707. Worklist.push_back(PredBB);
  1708. }
  1709. } while (!Worklist.empty());
  1710. // If blocks in this exit loop were directly part of the original loop (as
  1711. // opposed to a child loop) update the map to point to this exit loop. This
  1712. // just updates a map and so the fact that the order is unstable is fine.
  1713. for (auto *BB : NewExitLoopBlocks)
  1714. if (Loop *BBL = LI.getLoopFor(BB))
  1715. if (BBL == &L || !L.contains(BBL))
  1716. LI.changeLoopFor(BB, &ExitL);
  1717. // We will remove the remaining unlooped blocks from this loop in the next
  1718. // iteration or below.
  1719. NewExitLoopBlocks.clear();
  1720. }
  1721. // Any remaining unlooped blocks are no longer part of any loop unless they
  1722. // are part of some child loop.
  1723. for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
  1724. RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
  1725. for (auto *BB : UnloopedBlocks)
  1726. if (Loop *BBL = LI.getLoopFor(BB))
  1727. if (BBL == &L || !L.contains(BBL))
  1728. LI.changeLoopFor(BB, nullptr);
  1729. // Sink all the child loops whose headers are no longer in the loop set to
  1730. // the parent (or to be top level loops). We reach into the loop and directly
  1731. // update its subloop vector to make this batch update efficient.
  1732. auto &SubLoops = L.getSubLoopsVector();
  1733. auto SubLoopsSplitI =
  1734. LoopBlockSet.empty()
  1735. ? SubLoops.begin()
  1736. : std::stable_partition(
  1737. SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
  1738. return LoopBlockSet.count(SubL->getHeader());
  1739. });
  1740. for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
  1741. HoistedLoops.push_back(HoistedL);
  1742. HoistedL->setParentLoop(nullptr);
  1743. // To compute the new parent of this hoisted loop we look at where we
  1744. // placed the preheader above. We can't lookup the header itself because we
  1745. // retained the mapping from the header to the hoisted loop. But the
  1746. // preheader and header should have the exact same new parent computed
  1747. // based on the set of exit blocks from the original loop as the preheader
  1748. // is a predecessor of the header and so reached in the reverse walk. And
  1749. // because the loops were all in simplified form the preheader of the
  1750. // hoisted loop can't be part of some *other* loop.
  1751. if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
  1752. NewParentL->addChildLoop(HoistedL);
  1753. else
  1754. LI.addTopLevelLoop(HoistedL);
  1755. }
  1756. SubLoops.erase(SubLoopsSplitI, SubLoops.end());
  1757. // Actually delete the loop if nothing remained within it.
  1758. if (Blocks.empty()) {
  1759. assert(SubLoops.empty() &&
  1760. "Failed to remove all subloops from the original loop!");
  1761. if (Loop *ParentL = L.getParentLoop())
  1762. ParentL->removeChildLoop(llvm::find(*ParentL, &L));
  1763. else
  1764. LI.removeLoop(llvm::find(LI, &L));
  1765. // markLoopAsDeleted for L should be triggered by the caller (it is typically
  1766. // done by using the UnswitchCB callback).
  1767. LI.destroy(&L);
  1768. return false;
  1769. }
  1770. return true;
  1771. }
  1772. /// Helper to visit a dominator subtree, invoking a callable on each node.
  1773. ///
  1774. /// Returning false at any point will stop walking past that node of the tree.
  1775. template <typename CallableT>
  1776. void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
  1777. SmallVector<DomTreeNode *, 4> DomWorklist;
  1778. DomWorklist.push_back(DT[BB]);
  1779. #ifndef NDEBUG
  1780. SmallPtrSet<DomTreeNode *, 4> Visited;
  1781. Visited.insert(DT[BB]);
  1782. #endif
  1783. do {
  1784. DomTreeNode *N = DomWorklist.pop_back_val();
  1785. // Visit this node.
  1786. if (!Callable(N->getBlock()))
  1787. continue;
  1788. // Accumulate the child nodes.
  1789. for (DomTreeNode *ChildN : *N) {
  1790. assert(Visited.insert(ChildN).second &&
  1791. "Cannot visit a node twice when walking a tree!");
  1792. DomWorklist.push_back(ChildN);
  1793. }
  1794. } while (!DomWorklist.empty());
  1795. }
/// Unswitch a non-trivial (cloning-based) invariant condition out of \p L.
///
/// \p TI is the branch or switch in \p L whose condition involves the
/// loop-invariant values in \p Invariants. The loop body is cloned once per
/// unswitched successor, the condition is rebuilt in a block split off above
/// the preheader, and the DominatorTree, LoopInfo and (when \p MSSAU is
/// non-null) MemorySSA are all updated. Cached ScalarEvolution results for the
/// affected loop nest are invalidated when \p SE is non-null.
///
/// \p UnswitchCB is invoked at the end with (1) whether the original loop
/// still exists, (2) whether this was a partially-invariant unswitch, and
/// (3) any loops that became siblings of \p L as a result. \p DestroyLoopCB
/// is forwarded to loop deletion so callers can observe destroyed loops.
static void unswitchNontrivialInvariants(
    Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
    SmallVectorImpl<BasicBlock *> &ExitBlocks, IVConditionInfo &PartialIVInfo,
    DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
    function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
    ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
    function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
  auto *ParentBB = TI.getParent();
  BranchInst *BI = dyn_cast<BranchInst>(&TI);
  // Exactly one of BI/SI is non-null below.
  SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);

  // We can only unswitch switches, conditional branches with an invariant
  // condition, or combining invariant conditions with an instruction or
  // partially invariant instructions.
  assert((SI || (BI && BI->isConditional())) &&
         "Can only unswitch switches and conditional branch!");
  bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty();
  bool FullUnswitch =
      SI || (BI->getCondition() == Invariants[0] && !PartiallyInvariant);
  if (FullUnswitch)
    assert(Invariants.size() == 1 &&
           "Cannot have other invariants with full unswitching!");
  else
    assert(isa<Instruction>(BI->getCondition()) &&
           "Partial unswitching requires an instruction as the condition!");

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  // Constant and BBs tracking the cloned and continuing successor. When we are
  // unswitching the entire condition, this can just be trivially chosen to
  // unswitch towards `true`. However, when we are unswitching a set of
  // invariants combined with `and` or `or` or partially invariant instructions,
  // the combining operation determines the best direction to unswitch: we want
  // to unswitch the direction that will collapse the branch.
  bool Direction = true;
  int ClonedSucc = 0;
  if (!FullUnswitch) {
    Value *Cond = BI->getCondition();
    (void)Cond;
    assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) ||
            PartiallyInvariant) &&
           "Only `or`, `and`, an `select`, partially invariant instructions "
           "can combine invariants being unswitched.");
    if (!match(BI->getCondition(), m_LogicalOr())) {
      if (match(BI->getCondition(), m_LogicalAnd()) ||
          (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) {
        Direction = false;
        ClonedSucc = 1;
      }
    }
  }

  // RetainedSuccBB is the successor kept in the original loop; all other
  // successors are cloned into unswitched copies.
  BasicBlock *RetainedSuccBB =
      BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
  SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
  if (BI)
    UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
  else
    for (auto Case : SI->cases())
      if (Case.getCaseSuccessor() != RetainedSuccBB)
        UnswitchedSuccBBs.insert(Case.getCaseSuccessor());

  assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
         "Should not unswitch the same successor we are retaining!");

  // The branch should be in this exact loop. Any inner loop's invariant branch
  // should be handled by unswitching that inner loop. The caller of this
  // routine should filter out any candidates that remain (but were skipped for
  // whatever reason).
  assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");

  // Compute the parent loop now before we start hacking on things.
  Loop *ParentL = L.getParentLoop();
  // Get blocks in RPO order for MSSA update, before changing the CFG.
  LoopBlocksRPO LBRPO(&L);
  if (MSSAU)
    LBRPO.perform(&LI);

  // Compute the outer-most loop containing one of our exit blocks. This is the
  // furthest up our loopnest which can be mutated, which we will use below to
  // update things.
  Loop *OuterExitL = &L;
  for (auto *ExitBB : ExitBlocks) {
    Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
    if (!NewOuterExitL) {
      // We exited the entire nest with this block, so we're done.
      OuterExitL = nullptr;
      break;
    }
    if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
      OuterExitL = NewOuterExitL;
  }

  // At this point, we're definitely going to unswitch something so invalidate
  // any cached information in ScalarEvolution for the outer most loop
  // containing an exit block and all nested loops.
  if (SE) {
    if (OuterExitL)
      SE->forgetLoop(OuterExitL);
    else
      SE->forgetTopmostLoop(&L);
  }

  // A freeze is needed when the unswitched condition may be undef/poison and
  // the terminator was not guaranteed to execute on every iteration.
  bool InsertFreeze = false;
  if (FreezeLoopUnswitchCond) {
    ICFLoopSafetyInfo SafetyInfo;
    SafetyInfo.computeLoopSafetyInfo(&L);
    InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L);
  }

  // If the edge from this terminator to a successor dominates that successor,
  // store a map from each block in its dominator subtree to it. This lets us
  // tell when cloning for a particular successor if a block is dominated by
  // some *other* successor with a single data structure. We use this to
  // significantly reduce cloning.
  SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
  for (auto *SuccBB : llvm::concat<BasicBlock *const>(
           makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
    if (SuccBB->getUniquePredecessor() ||
        llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
          return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
        }))
      visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
        DominatingSucc[BB] = SuccBB;
        return true;
      });

  // Split the preheader, so that we know that there is a safe place to insert
  // the conditional branch. We will change the preheader to have a conditional
  // branch on LoopCond. The original preheader will become the split point
  // between the unswitched versions, and we will have a new preheader for the
  // original loop.
  BasicBlock *SplitBB = L.getLoopPreheader();
  BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);

  // Keep track of the dominator tree updates needed.
  SmallVector<DominatorTree::UpdateType, 4> DTUpdates;

  // Clone the loop for each unswitched successor.
  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
  VMaps.reserve(UnswitchedSuccBBs.size());
  SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
  for (auto *SuccBB : UnswitchedSuccBBs) {
    VMaps.emplace_back(new ValueToValueMapTy());
    ClonedPHs[SuccBB] = buildClonedLoopBlocks(
        L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
        DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
  }

  // Drop metadata if we may break its semantics by moving this instr into the
  // split block.
  if (TI.getMetadata(LLVMContext::MD_make_implicit)) {
    if (DropNonTrivialImplicitNullChecks)
      // Do not spend time trying to understand if we can keep it, just drop it
      // to save compile time.
      TI.setMetadata(LLVMContext::MD_make_implicit, nullptr);
    else {
      // It is only legal to preserve make.implicit metadata if we are
      // guaranteed no reach implicit null check after following this branch.
      ICFLoopSafetyInfo SafetyInfo;
      SafetyInfo.computeLoopSafetyInfo(&L);
      if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L))
        TI.setMetadata(LLVMContext::MD_make_implicit, nullptr);
    }
  }

  // The stitching of the branched code back together depends on whether we're
  // doing full unswitching or not with the exception that we always want to
  // nuke the initial terminator placed in the split block.
  SplitBB->getTerminator()->eraseFromParent();
  if (FullUnswitch) {
    // Splice the terminator from the original loop and rewrite its
    // successors.
    SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);

    // Keep a clone of the terminator for MSSA updates.
    // (NewTI stands in for TI inside ParentBB until the MSSA/DT updates below
    // are finished; it is erased before the direct branch is created.)
    Instruction *NewTI = TI.clone();
    ParentBB->getInstList().push_back(NewTI);

    // First wire up the moved terminator to the preheaders.
    if (BI) {
      BasicBlock *ClonedPH = ClonedPHs.begin()->second;
      BI->setSuccessor(ClonedSucc, ClonedPH);
      BI->setSuccessor(1 - ClonedSucc, LoopPH);
      if (InsertFreeze) {
        auto Cond = BI->getCondition();
        if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT))
          BI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", BI));
      }
      DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
    } else {
      assert(SI && "Must either be a branch or switch!");

      // Walk the cases and directly update their successors.
      assert(SI->getDefaultDest() == RetainedSuccBB &&
             "Not retaining default successor!");
      SI->setDefaultDest(LoopPH);
      for (auto &Case : SI->cases())
        if (Case.getCaseSuccessor() == RetainedSuccBB)
          Case.setSuccessor(LoopPH);
        else
          Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);

      if (InsertFreeze) {
        auto Cond = SI->getCondition();
        if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, SI, &DT))
          SI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", SI));
      }

      // We need to use the set to populate domtree updates as even when there
      // are multiple cases pointing at the same successor we only want to
      // remove and insert one edge in the domtree.
      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
        DTUpdates.push_back(
            {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
    }

    if (MSSAU) {
      DT.applyUpdates(DTUpdates);
      DTUpdates.clear();

      // Remove all but one edge to the retained block and all unswitched
      // blocks. This is to avoid having duplicate entries in the cloned Phis,
      // when we know we only keep a single edge for each case.
      MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
        MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);

      for (auto &VMap : VMaps)
        MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
                                   /*IgnoreIncomingWithNoClones=*/true);
      MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);

      // Remove all edges to unswitched blocks.
      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
        MSSAU->removeEdge(ParentBB, SuccBB);
    }

    // Now unhook the successor relationship as we'll be replacing
    // the terminator with a direct branch. This is much simpler for branches
    // than switches so we handle those first.
    if (BI) {
      // Remove the parent as a predecessor of the unswitched successor.
      assert(UnswitchedSuccBBs.size() == 1 &&
             "Only one possible unswitched block for a branch!");
      BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
      UnswitchedSuccBB->removePredecessor(ParentBB,
                                          /*KeepOneInputPHIs*/ true);
      DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
    } else {
      // Note that we actually want to remove the parent block as a predecessor
      // of *every* case successor. The case successor is either unswitched,
      // completely eliminating an edge from the parent to that successor, or it
      // is a duplicate edge to the retained successor as the retained successor
      // is always the default successor and as we'll replace this with a direct
      // branch we no longer need the duplicate entries in the PHI nodes.
      SwitchInst *NewSI = cast<SwitchInst>(NewTI);
      assert(NewSI->getDefaultDest() == RetainedSuccBB &&
             "Not retaining default successor!");
      for (auto &Case : NewSI->cases())
        Case.getCaseSuccessor()->removePredecessor(
            ParentBB,
            /*KeepOneInputPHIs*/ true);

      // We need to use the set to populate domtree updates as even when there
      // are multiple cases pointing at the same successor we only want to
      // remove and insert one edge in the domtree.
      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
        DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
    }

    // After MSSAU update, remove the cloned terminator instruction NewTI.
    ParentBB->getTerminator()->eraseFromParent();

    // Create a new unconditional branch to the continuing block (as opposed to
    // the one cloned).
    BranchInst::Create(RetainedSuccBB, ParentBB);
  } else {
    assert(BI && "Only branches have partial unswitching.");
    assert(UnswitchedSuccBBs.size() == 1 &&
           "Only one possible unswitched block for a branch!");
    BasicBlock *ClonedPH = ClonedPHs.begin()->second;
    // When doing a partial unswitch, we have to do a bit more work to build up
    // the branch in the split block.
    if (PartiallyInvariant)
      buildPartialInvariantUnswitchConditionalBranch(
          *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU);
    else
      buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
                                            *ClonedPH, *LoopPH, InsertFreeze);
    DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});

    if (MSSAU) {
      DT.applyUpdates(DTUpdates);
      DTUpdates.clear();

      // Perform MSSA cloning updates.
      for (auto &VMap : VMaps)
        MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
                                   /*IgnoreIncomingWithNoClones=*/true);
      MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
    }
  }

  // Apply the updates accumulated above to get an up-to-date dominator tree.
  DT.applyUpdates(DTUpdates);

  // Now that we have an accurate dominator tree, first delete the dead cloned
  // blocks so that we can accurately build any cloned loops. It is important to
  // not delete the blocks from the original loop yet because we still want to
  // reference the original loop to understand the cloned loop's structure.
  deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);

  // Build the cloned loop structure itself. This may be substantially
  // different from the original structure due to the simplified CFG. This also
  // handles inserting all the cloned blocks into the correct loops.
  SmallVector<Loop *, 4> NonChildClonedLoops;
  for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
    buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);

  // Now that our cloned loops have been built, we can update the original loop.
  // First we delete the dead blocks from it and then we rebuild the loop
  // structure taking these deletions into account.
  deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, DestroyLoopCB);

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  SmallVector<Loop *, 4> HoistedLoops;
  bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  // This transformation has a high risk of corrupting the dominator tree, and
  // the below steps to rebuild loop structures will result in hard to debug
  // errors in that case so verify that the dominator tree is sane first.
  // FIXME: Remove this when the bugs stop showing up and rely on existing
  // verification steps.
  assert(DT.verify(DominatorTree::VerificationLevel::Fast));

  if (BI && !PartiallyInvariant) {
    // If we unswitched a branch which collapses the condition to a known
    // constant we want to replace all the uses of the invariants within both
    // the original and cloned blocks. We do this here so that we can use the
    // now updated dominator tree to identify which side the users are on.
    assert(UnswitchedSuccBBs.size() == 1 &&
           "Only one possible unswitched block for a branch!");
    BasicBlock *ClonedPH = ClonedPHs.begin()->second;

    // When considering multiple partially-unswitched invariants
    // we cant just go replace them with constants in both branches.
    //
    // For 'AND' we infer that true branch ("continue") means true
    // for each invariant operand.
    // For 'OR' we can infer that false branch ("continue") means false
    // for each invariant operand.
    // So it happens that for multiple-partial case we dont replace
    // in the unswitched branch.
    bool ReplaceUnswitched =
        FullUnswitch || (Invariants.size() == 1) || PartiallyInvariant;

    ConstantInt *UnswitchedReplacement =
        Direction ? ConstantInt::getTrue(BI->getContext())
                  : ConstantInt::getFalse(BI->getContext());
    ConstantInt *ContinueReplacement =
        Direction ? ConstantInt::getFalse(BI->getContext())
                  : ConstantInt::getTrue(BI->getContext());
    for (Value *Invariant : Invariants) {
      assert(!isa<Constant>(Invariant) &&
             "Should not be replacing constant values!");
      // Use make_early_inc_range here as set invalidates the iterator.
      for (Use &U : llvm::make_early_inc_range(Invariant->uses())) {
        Instruction *UserI = dyn_cast<Instruction>(U.getUser());
        if (!UserI)
          continue;

        // Replace it with the 'continue' side if in the main loop body, and the
        // unswitched if in the cloned blocks.
        if (DT.dominates(LoopPH, UserI->getParent()))
          U.set(ContinueReplacement);
        else if (ReplaceUnswitched &&
                 DT.dominates(ClonedPH, UserI->getParent()))
          U.set(UnswitchedReplacement);
      }
    }
  }

  // We can change which blocks are exit blocks of all the cloned sibling
  // loops, the current loop, and any parent loops which shared exit blocks
  // with the current loop. As a consequence, we need to re-form LCSSA for
  // them. But we shouldn't need to re-form LCSSA for any child loops.
  // FIXME: This could be made more efficient by tracking which exit blocks are
  // new, and focusing on them, but that isn't likely to be necessary.
  //
  // In order to reasonably rebuild LCSSA we need to walk inside-out across the
  // loop nest and update every loop that could have had its exits changed. We
  // also need to cover any intervening loops. We add all of these loops to
  // a list and sort them by loop depth to achieve this without updating
  // unnecessary loops.
  auto UpdateLoop = [&](Loop &UpdateL) {
#ifndef NDEBUG
    UpdateL.verifyLoop();
    for (Loop *ChildL : UpdateL) {
      ChildL->verifyLoop();
      assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
             "Perturbed a child loop's LCSSA form!");
    }
#endif
    // First build LCSSA for this loop so that we can preserve it when
    // forming dedicated exits. We don't want to perturb some other loop's
    // LCSSA while doing that CFG edit.
    formLCSSA(UpdateL, DT, &LI, SE);

    // For loops reached by this loop's original exit blocks we may
    // introduced new, non-dedicated exits. At least try to re-form dedicated
    // exits for these loops. This may fail if they couldn't have dedicated
    // exits to start with.
    formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true);
  };

  // For non-child cloned loops and hoisted loops, we just need to update LCSSA
  // and we can do it in any order as they don't nest relative to each other.
  //
  // Also check if any of the loops we have updated have become top-level loops
  // as that will necessitate widening the outer loop scope.
  for (Loop *UpdatedL :
       llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
    UpdateLoop(*UpdatedL);
    if (UpdatedL->isOutermost())
      OuterExitL = nullptr;
  }
  if (IsStillLoop) {
    UpdateLoop(L);
    if (L.isOutermost())
      OuterExitL = nullptr;
  }

  // If the original loop had exit blocks, walk up through the outer most loop
  // of those exit blocks to update LCSSA and form updated dedicated exits.
  if (OuterExitL != &L)
    for (Loop *OuterL = ParentL; OuterL != OuterExitL;
         OuterL = OuterL->getParentLoop())
      UpdateLoop(*OuterL);

#ifndef NDEBUG
  // Verify the entire loop structure to catch any incorrect updates before we
  // progress in the pass pipeline.
  LI.verify(DT);
#endif

  // Now that we've unswitched something, make callbacks to report the changes.
  // For that we need to merge together the updated loops and the cloned loops
  // and check whether the original loop survived.
  SmallVector<Loop *, 4> SibLoops;
  for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
    if (UpdatedL->getParentLoop() == ParentL)
      SibLoops.push_back(UpdatedL);
  UnswitchCB(IsStillLoop, PartiallyInvariant, SibLoops);

  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();

  if (BI)
    ++NumBranches;
  else
    ++NumSwitches;
}
  2214. /// Recursively compute the cost of a dominator subtree based on the per-block
  2215. /// cost map provided.
  2216. ///
  2217. /// The recursive computation is memozied into the provided DT-indexed cost map
  2218. /// to allow querying it for most nodes in the domtree without it becoming
  2219. /// quadratic.
  2220. static InstructionCost computeDomSubtreeCost(
  2221. DomTreeNode &N,
  2222. const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap,
  2223. SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) {
  2224. // Don't accumulate cost (or recurse through) blocks not in our block cost
  2225. // map and thus not part of the duplication cost being considered.
  2226. auto BBCostIt = BBCostMap.find(N.getBlock());
  2227. if (BBCostIt == BBCostMap.end())
  2228. return 0;
  2229. // Lookup this node to see if we already computed its cost.
  2230. auto DTCostIt = DTCostMap.find(&N);
  2231. if (DTCostIt != DTCostMap.end())
  2232. return DTCostIt->second;
  2233. // If not, we have to compute it. We can't use insert above and update
  2234. // because computing the cost may insert more things into the map.
  2235. InstructionCost Cost = std::accumulate(
  2236. N.begin(), N.end(), BBCostIt->second,
  2237. [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost {
  2238. return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
  2239. });
  2240. bool Inserted = DTCostMap.insert({&N, Cost}).second;
  2241. (void)Inserted;
  2242. assert(Inserted && "Should not insert a node while visiting children!");
  2243. return Cost;
  2244. }
  2245. /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
  2246. /// making the following replacement:
  2247. ///
  2248. /// --code before guard--
  2249. /// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
  2250. /// --code after guard--
  2251. ///
  2252. /// into
  2253. ///
  2254. /// --code before guard--
  2255. /// br i1 %cond, label %guarded, label %deopt
  2256. ///
  2257. /// guarded:
  2258. /// --code after guard--
  2259. ///
  2260. /// deopt:
  2261. /// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
  2262. /// unreachable
  2263. ///
  2264. /// It also makes all relevant DT and LI updates, so that all structures are in
  2265. /// valid state after this transform.
  2266. static BranchInst *
  2267. turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
  2268. SmallVectorImpl<BasicBlock *> &ExitBlocks,
  2269. DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
  2270. SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
  2271. LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
  2272. BasicBlock *CheckBB = GI->getParent();
  2273. if (MSSAU && VerifyMemorySSA)
  2274. MSSAU->getMemorySSA()->verifyMemorySSA();
  2275. // Remove all CheckBB's successors from DomTree. A block can be seen among
  2276. // successors more than once, but for DomTree it should be added only once.
  2277. SmallPtrSet<BasicBlock *, 4> Successors;
  2278. for (auto *Succ : successors(CheckBB))
  2279. if (Successors.insert(Succ).second)
  2280. DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
  2281. Instruction *DeoptBlockTerm =
  2282. SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
  2283. BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
  2284. // SplitBlockAndInsertIfThen inserts control flow that branches to
  2285. // DeoptBlockTerm if the condition is true. We want the opposite.
  2286. CheckBI->swapSuccessors();
  2287. BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
  2288. GuardedBlock->setName("guarded");
  2289. CheckBI->getSuccessor(1)->setName("deopt");
  2290. BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
  2291. // We now have a new exit block.
  2292. ExitBlocks.push_back(CheckBI->getSuccessor(1));
  2293. if (MSSAU)
  2294. MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
  2295. GI->moveBefore(DeoptBlockTerm);
  2296. GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
  2297. // Add new successors of CheckBB into DomTree.
  2298. for (auto *Succ : successors(CheckBB))
  2299. DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
  2300. // Now the blocks that used to be CheckBB's successors are GuardedBlock's
  2301. // successors.
  2302. for (auto *Succ : Successors)
  2303. DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
  2304. // Make proper changes to DT.
  2305. DT.applyUpdates(DTUpdates);
  2306. // Inform LI of a new loop block.
  2307. L.addBasicBlockToLoop(GuardedBlock, LI);
  2308. if (MSSAU) {
  2309. MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
  2310. MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
  2311. if (VerifyMemorySSA)
  2312. MSSAU->getMemorySSA()->verifyMemorySSA();
  2313. }
  2314. ++NumGuards;
  2315. return CheckBI;
  2316. }
  2317. /// Cost multiplier is a way to limit potentially exponential behavior
  2318. /// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
  2319. /// candidates available. Also accounting for the number of "sibling" loops with
  2320. /// the idea to account for previous unswitches that already happened on this
  2321. /// cluster of loops. There was an attempt to keep this formula simple,
  2322. /// just enough to limit the worst case behavior. Even if it is not that simple
  2323. /// now it is still not an attempt to provide a detailed heuristic size
  2324. /// prediction.
  2325. ///
  2326. /// TODO: Make a proper accounting of "explosion" effect for all kinds of
  2327. /// unswitch candidates, making adequate predictions instead of wild guesses.
  2328. /// That requires knowing not just the number of "remaining" candidates but
  2329. /// also costs of unswitching for each of these candidates.
  2330. static int CalculateUnswitchCostMultiplier(
  2331. Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
  2332. ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
  2333. UnswitchCandidates) {
  2334. // Guards and other exiting conditions do not contribute to exponential
  2335. // explosion as soon as they dominate the latch (otherwise there might be
  2336. // another path to the latch remaining that does not allow to eliminate the
  2337. // loop copy on unswitch).
  2338. BasicBlock *Latch = L.getLoopLatch();
  2339. BasicBlock *CondBlock = TI.getParent();
  2340. if (DT.dominates(CondBlock, Latch) &&
  2341. (isGuard(&TI) ||
  2342. llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
  2343. return L.contains(SuccBB);
  2344. }) <= 1)) {
  2345. NumCostMultiplierSkipped++;
  2346. return 1;
  2347. }
  2348. auto *ParentL = L.getParentLoop();
  2349. int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
  2350. : std::distance(LI.begin(), LI.end()));
  2351. // Count amount of clones that all the candidates might cause during
  2352. // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases.
  2353. int UnswitchedClones = 0;
  2354. for (auto Candidate : UnswitchCandidates) {
  2355. Instruction *CI = Candidate.first;
  2356. BasicBlock *CondBlock = CI->getParent();
  2357. bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
  2358. if (isGuard(CI)) {
  2359. if (!SkipExitingSuccessors)
  2360. UnswitchedClones++;
  2361. continue;
  2362. }
  2363. int NonExitingSuccessors = llvm::count_if(
  2364. successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
  2365. return !SkipExitingSuccessors || L.contains(SuccBB);
  2366. });
  2367. UnswitchedClones += Log2_32(NonExitingSuccessors);
  2368. }
  2369. // Ignore up to the "unscaled candidates" number of unswitch candidates
  2370. // when calculating the power-of-two scaling of the cost. The main idea
  2371. // with this control is to allow a small number of unswitches to happen
  2372. // and rely more on siblings multiplier (see below) when the number
  2373. // of candidates is small.
  2374. unsigned ClonesPower =
  2375. std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
  2376. // Allowing top-level loops to spread a bit more than nested ones.
  2377. int SiblingsMultiplier =
  2378. std::max((ParentL ? SiblingsCount
  2379. : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
  2380. 1);
  2381. // Compute the cost multiplier in a way that won't overflow by saturating
  2382. // at an upper bound.
  2383. int CostMultiplier;
  2384. if (ClonesPower > Log2_32(UnswitchThreshold) ||
  2385. SiblingsMultiplier > UnswitchThreshold)
  2386. CostMultiplier = UnswitchThreshold;
  2387. else
  2388. CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
  2389. (int)UnswitchThreshold);
  2390. LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
  2391. << " (siblings " << SiblingsMultiplier << " * clones "
  2392. << (1 << ClonesPower) << ")"
  2393. << " for unswitch candidate: " << TI << "\n");
  2394. return CostMultiplier;
  2395. }
/// Find the best non-trivial unswitch candidate in \p L and unswitch on it.
///
/// Collects fully loop-invariant branch, switch and guard conditions (plus,
/// when MemorySSA is available, at most one partially invariant condition),
/// prices each candidate by how much of the loop would have to be duplicated,
/// and unswitches the cheapest candidate if its cost stays below the
/// threshold. Returns true iff an unswitch was performed.
static bool unswitchBestCondition(
    Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
    AAResults &AA, TargetTransformInfo &TTI,
    function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
    ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
    function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
  // Collect all invariant conditions within this loop (as opposed to an inner
  // loop which would be handled when visiting that inner loop).
  SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
      UnswitchCandidates;

  // Whether or not we should also collect guards in the loop.
  bool CollectGuards = false;
  if (UnswitchGuards) {
    // Only bother scanning for guards if the guard intrinsic is actually used
    // somewhere in this module.
    auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
        Intrinsic::getName(Intrinsic::experimental_guard));
    if (GuardDecl && !GuardDecl->use_empty())
      CollectGuards = true;
  }

  IVConditionInfo PartialIVInfo;
  for (auto *BB : L.blocks()) {
    // Skip blocks that belong to inner loops; those are handled when the
    // inner loop itself is visited.
    if (LI.getLoopFor(BB) != &L)
      continue;

    if (CollectGuards)
      for (auto &I : *BB)
        if (isGuard(&I)) {
          auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
          // TODO: Support AND, OR conditions and partial unswitching.
          if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
            UnswitchCandidates.push_back({&I, {Cond}});
        }

    if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
      // We can only consider fully loop-invariant switch conditions as we need
      // to completely eliminate the switch after unswitching.
      if (!isa<Constant>(SI->getCondition()) &&
          L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor())
        UnswitchCandidates.push_back({SI, {SI->getCondition()}});
      continue;
    }

    // Only truly conditional branches with distinct successors are
    // interesting.
    auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
    if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
        BI->getSuccessor(0) == BI->getSuccessor(1))
      continue;

    // If BI's condition is 'select _, true, false', simplify it so it does
    // not confuse the matchers below.
    Value *Cond = BI->getCondition(), *CondNext;
    while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero())))
      Cond = CondNext;
    BI->setCondition(Cond);

    if (isa<Constant>(Cond))
      continue;

    if (L.isLoopInvariant(BI->getCondition())) {
      UnswitchCandidates.push_back({BI, {BI->getCondition()}});
      continue;
    }

    // A logical and/or condition may still be partially unswitchable on its
    // loop-invariant operands.
    Instruction &CondI = *cast<Instruction>(BI->getCondition());
    if (match(&CondI, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) {
      TinyPtrVector<Value *> Invariants =
          collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
      if (Invariants.empty())
        continue;

      UnswitchCandidates.push_back({BI, std::move(Invariants)});
      continue;
    }
  }

  // When MemorySSA is available, also consider a single partially invariant
  // condition in the header, unless partial unswitching was disabled via loop
  // metadata or the header terminator is already a candidate.
  Instruction *PartialIVCondBranch = nullptr;
  if (MSSAU && !findOptionMDForLoop(&L, "llvm.loop.unswitch.partial.disable") &&
      !any_of(UnswitchCandidates, [&L](auto &TerminatorAndInvariants) {
        return TerminatorAndInvariants.first == L.getHeader()->getTerminator();
      })) {
    MemorySSA *MSSA = MSSAU->getMemorySSA();
    if (auto Info = hasPartialIVCondition(L, MSSAThreshold, *MSSA, AA)) {
      LLVM_DEBUG(
          dbgs() << "simple-loop-unswitch: Found partially invariant condition "
                 << *Info->InstToDuplicate[0] << "\n");
      PartialIVInfo = *Info;
      PartialIVCondBranch = L.getHeader()->getTerminator();
      TinyPtrVector<Value *> ValsToDuplicate;
      for (auto *Inst : Info->InstToDuplicate)
        ValsToDuplicate.push_back(Inst);
      UnswitchCandidates.push_back(
          {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)});
    }
  }

  // If we didn't find any candidates, we're done.
  if (UnswitchCandidates.empty())
    return false;

  // Check if there are irreducible CFG cycles in this loop. If so, we cannot
  // easily unswitch non-trivial edges out of the loop. Doing so might turn the
  // irreducible control flow into reducible control flow and introduce new
  // loops "out of thin air". If we ever discover important use cases for doing
  // this, we can add support to loop unswitch, but it is a lot of complexity
  // for what seems little or no real world benefit.
  LoopBlocksRPO RPOT(&L);
  RPOT.perform(&LI);
  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
    return false;

  SmallVector<BasicBlock *, 4> ExitBlocks;
  L.getUniqueExitBlocks(ExitBlocks);

  // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch
  // instruction as we don't know how to split those exit blocks.
  // FIXME: We should teach SplitBlock to handle this and remove this
  // restriction.
  for (auto *ExitBB : ExitBlocks) {
    auto *I = ExitBB->getFirstNonPHI();
    if (isa<CleanupPadInst>(I) || isa<CatchSwitchInst>(I)) {
      LLVM_DEBUG(dbgs() << "Cannot unswitch because of cleanuppad/catchswitch "
                           "in exit block\n");
      return false;
    }
  }

  LLVM_DEBUG(
      dbgs() << "Considering " << UnswitchCandidates.size()
             << " non-trivial loop invariant conditions for unswitching.\n");

  // Given that unswitching these terminators will require duplicating parts of
  // the loop, so we need to be able to model that cost. Compute the ephemeral
  // values and set up a data structure to hold per-BB costs. We cache each
  // block's cost so that we don't recompute this when considering different
  // subsets of the loop for duplication during unswitching.
  SmallPtrSet<const Value *, 4> EphValues;
  CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
  SmallDenseMap<BasicBlock *, InstructionCost, 4> BBCostMap;

  // Compute the cost of each block, as well as the total loop cost. Also, bail
  // out if we see instructions which are incompatible with loop unswitching
  // (convergent, noduplicate, or cross-basic-block tokens).
  // FIXME: We might be able to safely handle some of these in non-duplicated
  // regions.
  TargetTransformInfo::TargetCostKind CostKind =
      L.getHeader()->getParent()->hasMinSize()
          ? TargetTransformInfo::TCK_CodeSize
          : TargetTransformInfo::TCK_SizeAndLatency;
  InstructionCost LoopCost = 0;
  for (auto *BB : L.blocks()) {
    InstructionCost Cost = 0;
    for (auto &I : *BB) {
      // Ephemeral values (only feeding assumes) cost nothing.
      if (EphValues.count(&I))
        continue;

      if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
        return false;
      if (auto *CB = dyn_cast<CallBase>(&I))
        if (CB->isConvergent() || CB->cannotDuplicate())
          return false;

      Cost += TTI.getUserCost(&I, CostKind);
    }
    assert(Cost >= 0 && "Must not have negative costs!");
    LoopCost += Cost;
    assert(LoopCost >= 0 && "Must not have negative loop costs!");
    BBCostMap[BB] = Cost;
  }
  LLVM_DEBUG(dbgs() << "  Total loop cost: " << LoopCost << "\n");

  // Now we find the best candidate by searching for the one with the following
  // properties in order:
  //
  // 1) An unswitching cost below the threshold
  // 2) The smallest number of duplicated unswitch candidates (to avoid
  //    creating redundant subsequent unswitching)
  // 3) The smallest cost after unswitching.
  //
  // We prioritize reducing fanout of unswitch candidates provided the cost
  // remains below the threshold because this has a multiplicative effect.
  //
  // This requires memoizing each dominator subtree to avoid redundant work.
  //
  // FIXME: Need to actually do the number of candidates part above.
  SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap;
  // Given a terminator which might be unswitched, computes the non-duplicated
  // cost for that terminator.
  auto ComputeUnswitchedCost = [&](Instruction &TI,
                                   bool FullUnswitch) -> InstructionCost {
    BasicBlock &BB = *TI.getParent();
    SmallPtrSet<BasicBlock *, 4> Visited;

    InstructionCost Cost = 0;
    for (BasicBlock *SuccBB : successors(&BB)) {
      // Don't count successors more than once.
      if (!Visited.insert(SuccBB).second)
        continue;

      // If this is a partial unswitch candidate, then it must be a conditional
      // branch with a condition of either `or`, `and`, their corresponding
      // select forms or partially invariant instructions. In that case, one of
      // the successors is necessarily duplicated, so don't even try to remove
      // its cost.
      if (!FullUnswitch) {
        auto &BI = cast<BranchInst>(TI);
        if (match(BI.getCondition(), m_LogicalAnd())) {
          if (SuccBB == BI.getSuccessor(1))
            continue;
        } else if (match(BI.getCondition(), m_LogicalOr())) {
          if (SuccBB == BI.getSuccessor(0))
            continue;
        } else if ((PartialIVInfo.KnownValue->isOneValue() &&
                    SuccBB == BI.getSuccessor(0)) ||
                   (!PartialIVInfo.KnownValue->isOneValue() &&
                    SuccBB == BI.getSuccessor(1)))
          continue;
      }

      // This successor's domtree will not need to be duplicated after
      // unswitching if the edge to the successor dominates it (and thus the
      // entire tree). This essentially means there is no other path into this
      // subtree and so it will end up live in only one clone of the loop.
      if (SuccBB->getUniquePredecessor() ||
          llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
            return PredBB == &BB || DT.dominates(SuccBB, PredBB);
          })) {
        Cost += computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
        assert(Cost <= LoopCost &&
               "Non-duplicated cost should never exceed total loop cost!");
      }
    }

    // Now scale the cost by the number of unique successors minus one. We
    // subtract one because there is already at least one copy of the entire
    // loop. This is computing the new cost of unswitching a condition.
    // Note that guards always have 2 unique successors that are implicit and
    // will be materialized if we decide to unswitch it.
    int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
    assert(SuccessorsCount > 1 &&
           "Cannot unswitch a condition without multiple distinct successors!");
    return (LoopCost - Cost) * (SuccessorsCount - 1);
  };

  Instruction *BestUnswitchTI = nullptr;
  InstructionCost BestUnswitchCost = 0;
  ArrayRef<Value *> BestUnswitchInvariants;
  for (auto &TerminatorAndInvariants : UnswitchCandidates) {
    Instruction &TI = *TerminatorAndInvariants.first;
    ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
    BranchInst *BI = dyn_cast<BranchInst>(&TI);
    // A branch is a full unswitch only when its entire condition is invariant;
    // otherwise only the invariant operands are unswitched (partial).
    InstructionCost CandidateCost = ComputeUnswitchedCost(
        TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
                                     Invariants[0] == BI->getCondition()));
    // Calculate cost multiplier which is a tool to limit potentially
    // exponential behavior of loop-unswitch.
    if (EnableUnswitchCostMultiplier) {
      int CostMultiplier =
          CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
      assert(
          (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
          "cost multiplier needs to be in the range of 1..UnswitchThreshold");
      CandidateCost *= CostMultiplier;
      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
                        << " (multiplier: " << CostMultiplier << ")"
                        << " for unswitch candidate: " << TI << "\n");
    } else {
      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
                        << " for unswitch candidate: " << TI << "\n");
    }

    if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
      BestUnswitchTI = &TI;
      BestUnswitchCost = CandidateCost;
      BestUnswitchInvariants = Invariants;
    }
  }
  assert(BestUnswitchTI && "Failed to find loop unswitch candidate");

  if (BestUnswitchCost >= UnswitchThreshold) {
    LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
                      << BestUnswitchCost << "\n");
    return false;
  }

  // Clear the partial-invariance info unless it belongs to the winner.
  if (BestUnswitchTI != PartialIVCondBranch)
    PartialIVInfo.InstToDuplicate.clear();

  // If the best candidate is a guard, turn it into a branch.
  if (isGuard(BestUnswitchTI))
    BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
                                         ExitBlocks, DT, LI, MSSAU);

  LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                    << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                    << "\n");
  unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
                               ExitBlocks, PartialIVInfo, DT, LI, AC,
                               UnswitchCB, SE, MSSAU, DestroyLoopCB);
  return true;
}
  2665. /// Unswitch control flow predicated on loop invariant conditions.
  2666. ///
  2667. /// This first hoists all branches or switches which are trivial (IE, do not
  2668. /// require duplicating any part of the loop) out of the loop body. It then
  2669. /// looks at other loop invariant control flows and tries to unswitch those as
  2670. /// well by cloning the loop if the result is small enough.
  2671. ///
  2672. /// The `DT`, `LI`, `AC`, `AA`, `TTI` parameters are required analyses that are
  2673. /// also updated based on the unswitch. The `MSSA` analysis is also updated if
  2674. /// valid (i.e. its use is enabled).
  2675. ///
  2676. /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
  2677. /// true, we will attempt to do non-trivial unswitching as well as trivial
  2678. /// unswitching.
  2679. ///
  2680. /// The `UnswitchCB` callback provided will be run after unswitching is
  2681. /// complete, with the first parameter set to `true` if the provided loop
  2682. /// remains a loop, and a list of new sibling loops created.
  2683. ///
  2684. /// If `SE` is non-null, we will update that analysis based on the unswitching
  2685. /// done.
  2686. static bool
  2687. unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
  2688. AAResults &AA, TargetTransformInfo &TTI, bool Trivial,
  2689. bool NonTrivial,
  2690. function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
  2691. ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
  2692. function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
  2693. assert(L.isRecursivelyLCSSAForm(DT, LI) &&
  2694. "Loops must be in LCSSA form before unswitching.");
  2695. // Must be in loop simplified form: we need a preheader and dedicated exits.
  2696. if (!L.isLoopSimplifyForm())
  2697. return false;
  2698. // Try trivial unswitch first before loop over other basic blocks in the loop.
  2699. if (Trivial && unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
  2700. // If we unswitched successfully we will want to clean up the loop before
  2701. // processing it further so just mark it as unswitched and return.
  2702. UnswitchCB(/*CurrentLoopValid*/ true, false, {});
  2703. return true;
  2704. }
  2705. // Check whether we should continue with non-trivial conditions.
  2706. // EnableNonTrivialUnswitch: Global variable that forces non-trivial
  2707. // unswitching for testing and debugging.
  2708. // NonTrivial: Parameter that enables non-trivial unswitching for this
  2709. // invocation of the transform. But this should be allowed only
  2710. // for targets without branch divergence.
  2711. //
  2712. // FIXME: If divergence analysis becomes available to a loop
  2713. // transform, we should allow unswitching for non-trivial uniform
  2714. // branches even on targets that have divergence.
  2715. // https://bugs.llvm.org/show_bug.cgi?id=48819
  2716. bool ContinueWithNonTrivial =
  2717. EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
  2718. if (!ContinueWithNonTrivial)
  2719. return false;
  2720. // Skip non-trivial unswitching for optsize functions.
  2721. if (L.getHeader()->getParent()->hasOptSize())
  2722. return false;
  2723. // Skip non-trivial unswitching for loops that cannot be cloned.
  2724. if (!L.isSafeToClone())
  2725. return false;
  2726. // For non-trivial unswitching, because it often creates new loops, we rely on
  2727. // the pass manager to iterate on the loops rather than trying to immediately
  2728. // reach a fixed point. There is no substantial advantage to iterating
  2729. // internally, and if any of the new loops are simplified enough to contain
  2730. // trivial unswitching we want to prefer those.
  2731. // Try to unswitch the best invariant condition. We prefer this full unswitch to
  2732. // a partial unswitch when possible below the threshold.
  2733. if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, UnswitchCB, SE, MSSAU,
  2734. DestroyLoopCB))
  2735. return true;
  2736. // No other opportunities to unswitch.
  2737. return false;
  2738. }
  2739. PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
  2740. LoopStandardAnalysisResults &AR,
  2741. LPMUpdater &U) {
  2742. Function &F = *L.getHeader()->getParent();
  2743. (void)F;
  2744. LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
  2745. << "\n");
  2746. // Save the current loop name in a variable so that we can report it even
  2747. // after it has been deleted.
  2748. std::string LoopName = std::string(L.getName());
  2749. auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
  2750. bool PartiallyInvariant,
  2751. ArrayRef<Loop *> NewLoops) {
  2752. // If we did a non-trivial unswitch, we have added new (cloned) loops.
  2753. if (!NewLoops.empty())
  2754. U.addSiblingLoops(NewLoops);
  2755. // If the current loop remains valid, we should revisit it to catch any
  2756. // other unswitch opportunities. Otherwise, we need to mark it as deleted.
  2757. if (CurrentLoopValid) {
  2758. if (PartiallyInvariant) {
  2759. // Mark the new loop as partially unswitched, to avoid unswitching on
  2760. // the same condition again.
  2761. auto &Context = L.getHeader()->getContext();
  2762. MDNode *DisableUnswitchMD = MDNode::get(
  2763. Context,
  2764. MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
  2765. MDNode *NewLoopID = makePostTransformationMetadata(
  2766. Context, L.getLoopID(), {"llvm.loop.unswitch.partial"},
  2767. {DisableUnswitchMD});
  2768. L.setLoopID(NewLoopID);
  2769. } else
  2770. U.revisitCurrentLoop();
  2771. } else
  2772. U.markLoopAsDeleted(L, LoopName);
  2773. };
  2774. auto DestroyLoopCB = [&U](Loop &L, StringRef Name) {
  2775. U.markLoopAsDeleted(L, Name);
  2776. };
  2777. Optional<MemorySSAUpdater> MSSAU;
  2778. if (AR.MSSA) {
  2779. MSSAU = MemorySSAUpdater(AR.MSSA);
  2780. if (VerifyMemorySSA)
  2781. AR.MSSA->verifyMemorySSA();
  2782. }
  2783. if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial,
  2784. UnswitchCB, &AR.SE,
  2785. MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
  2786. DestroyLoopCB))
  2787. return PreservedAnalyses::all();
  2788. if (AR.MSSA && VerifyMemorySSA)
  2789. AR.MSSA->verifyMemorySSA();
  2790. // Historically this pass has had issues with the dominator tree so verify it
  2791. // in asserts builds.
  2792. assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
  2793. auto PA = getLoopPassPreservedAnalyses();
  2794. if (AR.MSSA)
  2795. PA.preserve<MemorySSAAnalysis>();
  2796. return PA;
  2797. }
  2798. void SimpleLoopUnswitchPass::printPipeline(
  2799. raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  2800. static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline(
  2801. OS, MapClassName2PassName);
  2802. OS << "<";
  2803. OS << (NonTrivial ? "" : "no-") << "nontrivial;";
  2804. OS << (Trivial ? "" : "no-") << "trivial";
  2805. OS << ">";
  2806. }
namespace {

/// Legacy pass manager wrapper for the simple loop unswitch transform.
class SimpleLoopUnswitchLegacyPass : public LoopPass {
  // Whether non-trivial (loop-cloning) unswitching is enabled for this pass
  // instance.
  bool NonTrivial;

public:
  static char ID; // Pass ID, replacement for typeid

  explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
      : LoopPass(ID), NonTrivial(NonTrivial) {
    initializeSimpleLoopUnswitchLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    // MemorySSA is required (and kept up to date) by this pass.
    AU.addRequired<MemorySSAWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace
  2827. bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
  2828. if (skipLoop(L))
  2829. return false;
  2830. Function &F = *L->getHeader()->getParent();
  2831. LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
  2832. << "\n");
  2833. auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  2834. auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  2835. auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  2836. auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  2837. auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  2838. MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
  2839. MemorySSAUpdater MSSAU(MSSA);
  2840. auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
  2841. auto *SE = SEWP ? &SEWP->getSE() : nullptr;
  2842. auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid, bool PartiallyInvariant,
  2843. ArrayRef<Loop *> NewLoops) {
  2844. // If we did a non-trivial unswitch, we have added new (cloned) loops.
  2845. for (auto *NewL : NewLoops)
  2846. LPM.addLoop(*NewL);
  2847. // If the current loop remains valid, re-add it to the queue. This is
  2848. // a little wasteful as we'll finish processing the current loop as well,
  2849. // but it is the best we can do in the old PM.
  2850. if (CurrentLoopValid) {
  2851. // If the current loop has been unswitched using a partially invariant
  2852. // condition, we should not re-add the current loop to avoid unswitching
  2853. // on the same condition again.
  2854. if (!PartiallyInvariant)
  2855. LPM.addLoop(*L);
  2856. } else
  2857. LPM.markLoopAsDeleted(*L);
  2858. };
  2859. auto DestroyLoopCB = [&LPM](Loop &L, StringRef /* Name */) {
  2860. LPM.markLoopAsDeleted(L);
  2861. };
  2862. if (VerifyMemorySSA)
  2863. MSSA->verifyMemorySSA();
  2864. bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial,
  2865. UnswitchCB, SE, &MSSAU, DestroyLoopCB);
  2866. if (VerifyMemorySSA)
  2867. MSSA->verifyMemorySSA();
  2868. // Historically this pass has had issues with the dominator tree so verify it
  2869. // in asserts builds.
  2870. assert(DT.verify(DominatorTree::VerificationLevel::Fast));
  2871. return Changed;
  2872. }
char SimpleLoopUnswitchLegacyPass::ID = 0;

// Register the legacy pass and its analysis dependencies.
INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
                      "Simple unswitch loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
                    "Simple unswitch loops", false, false)

// Factory for the legacy pass; \p NonTrivial enables loop-cloning unswitches.
Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
  return new SimpleLoopUnswitchLegacyPass(NonTrivial);
}