  1. //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass implements a simple loop unroller. It works best when loops have
  10. // been canonicalized by the -indvars pass, allowing it to determine the trip
  11. // counts of loops easily.
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
  14. #include "llvm/ADT/DenseMap.h"
  15. #include "llvm/ADT/DenseMapInfo.h"
  16. #include "llvm/ADT/DenseSet.h"
  17. #include "llvm/ADT/STLExtras.h"
  18. #include "llvm/ADT/SetVector.h"
  19. #include "llvm/ADT/SmallPtrSet.h"
  20. #include "llvm/ADT/SmallVector.h"
  21. #include "llvm/ADT/StringRef.h"
  22. #include "llvm/Analysis/AssumptionCache.h"
  23. #include "llvm/Analysis/BlockFrequencyInfo.h"
  24. #include "llvm/Analysis/CodeMetrics.h"
  25. #include "llvm/Analysis/LoopAnalysisManager.h"
  26. #include "llvm/Analysis/LoopInfo.h"
  27. #include "llvm/Analysis/LoopPass.h"
  28. #include "llvm/Analysis/LoopUnrollAnalyzer.h"
  29. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  30. #include "llvm/Analysis/ProfileSummaryInfo.h"
  31. #include "llvm/Analysis/ScalarEvolution.h"
  32. #include "llvm/Analysis/TargetTransformInfo.h"
  33. #include "llvm/IR/BasicBlock.h"
  34. #include "llvm/IR/CFG.h"
  35. #include "llvm/IR/Constant.h"
  36. #include "llvm/IR/Constants.h"
  37. #include "llvm/IR/DiagnosticInfo.h"
  38. #include "llvm/IR/Dominators.h"
  39. #include "llvm/IR/Function.h"
  40. #include "llvm/IR/Instruction.h"
  41. #include "llvm/IR/Instructions.h"
  42. #include "llvm/IR/IntrinsicInst.h"
  43. #include "llvm/IR/Metadata.h"
  44. #include "llvm/IR/PassManager.h"
  45. #include "llvm/InitializePasses.h"
  46. #include "llvm/Pass.h"
  47. #include "llvm/Support/Casting.h"
  48. #include "llvm/Support/CommandLine.h"
  49. #include "llvm/Support/Debug.h"
  50. #include "llvm/Support/ErrorHandling.h"
  51. #include "llvm/Support/raw_ostream.h"
  52. #include "llvm/Transforms/Scalar.h"
  53. #include "llvm/Transforms/Scalar/LoopPassManager.h"
  54. #include "llvm/Transforms/Utils.h"
  55. #include "llvm/Transforms/Utils/LoopPeel.h"
  56. #include "llvm/Transforms/Utils/LoopSimplify.h"
  57. #include "llvm/Transforms/Utils/LoopUtils.h"
  58. #include "llvm/Transforms/Utils/SizeOpts.h"
  59. #include "llvm/Transforms/Utils/UnrollLoop.h"
  60. #include <algorithm>
  61. #include <cassert>
  62. #include <cstdint>
  63. #include <limits>
  64. #include <optional>
  65. #include <string>
  66. #include <tuple>
  67. #include <utility>
  68. using namespace llvm;
  69. #define DEBUG_TYPE "loop-unroll"
  70. cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
  71. "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
  72. cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
  73. " the current top-most loop. This is sometimes preferred to reduce"
  74. " compile time."));
  75. static cl::opt<unsigned>
  76. UnrollThreshold("unroll-threshold", cl::Hidden,
  77. cl::desc("The cost threshold for loop unrolling"));
  78. static cl::opt<unsigned>
  79. UnrollOptSizeThreshold(
  80. "unroll-optsize-threshold", cl::init(0), cl::Hidden,
  81. cl::desc("The cost threshold for loop unrolling when optimizing for "
  82. "size"));
  83. static cl::opt<unsigned> UnrollPartialThreshold(
  84. "unroll-partial-threshold", cl::Hidden,
  85. cl::desc("The cost threshold for partial loop unrolling"));
  86. static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
  87. "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
  88. cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
  89. "to the threshold when aggressively unrolling a loop due to the "
  90. "dynamic cost savings. If completely unrolling a loop will reduce "
  91. "the total runtime from X to Y, we boost the loop unroll "
  92. "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
  93. "X/Y). This limit avoids excessive code bloat."));
  94. static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
  95. "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
  96. cl::desc("Don't allow loop unrolling to simulate more than this number of"
  97. "iterations when checking full unroll profitability"));
  98. static cl::opt<unsigned> UnrollCount(
  99. "unroll-count", cl::Hidden,
  100. cl::desc("Use this unroll count for all loops including those with "
  101. "unroll_count pragma values, for testing purposes"));
  102. static cl::opt<unsigned> UnrollMaxCount(
  103. "unroll-max-count", cl::Hidden,
  104. cl::desc("Set the max unroll count for partial and runtime unrolling, for"
  105. "testing purposes"));
  106. static cl::opt<unsigned> UnrollFullMaxCount(
  107. "unroll-full-max-count", cl::Hidden,
  108. cl::desc(
  109. "Set the max unroll count for full unrolling, for testing purposes"));
  110. static cl::opt<bool>
  111. UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
  112. cl::desc("Allows loops to be partially unrolled until "
  113. "-unroll-threshold loop size is reached."));
  114. static cl::opt<bool> UnrollAllowRemainder(
  115. "unroll-allow-remainder", cl::Hidden,
  116. cl::desc("Allow generation of a loop remainder (extra iterations) "
  117. "when unrolling a loop."));
  118. static cl::opt<bool>
  119. UnrollRuntime("unroll-runtime", cl::Hidden,
  120. cl::desc("Unroll loops with run-time trip counts"));
  121. static cl::opt<unsigned> UnrollMaxUpperBound(
  122. "unroll-max-upperbound", cl::init(8), cl::Hidden,
  123. cl::desc(
  124. "The max of trip count upper bound that is considered in unrolling"));
  125. static cl::opt<unsigned> PragmaUnrollThreshold(
  126. "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
  127. cl::desc("Unrolled size limit for loops with an unroll(full) or "
  128. "unroll_count pragma."));
  129. static cl::opt<unsigned> FlatLoopTripCountThreshold(
  130. "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
  131. cl::desc("If the runtime tripcount for the loop is lower than the "
  132. "threshold, the loop is considered as flat and will be less "
  133. "aggressively unrolled."));
  134. static cl::opt<bool> UnrollUnrollRemainder(
  135. "unroll-remainder", cl::Hidden,
  136. cl::desc("Allow the loop remainder to be unrolled."));
  137. // This option isn't ever intended to be enabled, it serves to allow
  138. // experiments to check the assumptions about when this kind of revisit is
  139. // necessary.
  140. static cl::opt<bool> UnrollRevisitChildLoops(
  141. "unroll-revisit-child-loops", cl::Hidden,
  142. cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
  143. "This shouldn't typically be needed as child loops (or their "
  144. "clones) were already visited."));
  145. static cl::opt<unsigned> UnrollThresholdAggressive(
  146. "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
  147. cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
  148. "optimizations"));
  149. static cl::opt<unsigned>
  150. UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
  151. cl::Hidden,
  152. cl::desc("Default threshold (max size of unrolled "
  153. "loop), used in all but O3 optimizations"));
  154. /// A magic value for use with the Threshold parameter to indicate
  155. /// that the loop unroll should be performed regardless of how much
  156. /// code expansion would result.
  157. static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
///
/// Preference sources are applied in strictly increasing precedence:
///   1. hard-coded defaults,
///   2. target overrides via TTI.getUnrollingPreferences(),
///   3. size attributes / PGSO (opt-for-size shrinks the thresholds),
///   4. command-line cl::opt flags (only when explicitly passed),
///   5. explicit caller-supplied User* arguments.
/// The statement order below implements that precedence; do not reorder.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
    OptimizationRemarkEmitter &ORE, int OptLevel,
    std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
    std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
    std::optional<bool> UserUpperBound,
    std::optional<unsigned> UserFullUnrollMaxCount) {
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults. O3 (OptLevel > 2) gets the larger, aggressive
  // threshold; everything else uses the default threshold.
  UP.Threshold =
      OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
  UP.MaxPercentThresholdBoost = 400;
  UP.OptSizeThreshold = UnrollOptSizeThreshold;
  UP.PartialThreshold = 150;
  UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
  UP.Count = 0;
  UP.DefaultUnrollRuntimeCount = 8;
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
  UP.BEInsns = 2;
  UP.Partial = false;
  UP.Runtime = false;
  UP.AllowRemainder = true;
  UP.UnrollRemainder = false;
  UP.AllowExpensiveTripCount = false;
  UP.Force = false;
  UP.UpperBound = false;
  UP.UnrollAndJam = false;
  UP.UnrollAndJamInnerLoopThreshold = 60;
  UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;

  // Override with any target specific settings
  TTI.getUnrollingPreferences(L, SE, UP, &ORE);

  // Apply size attributes
  bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
                    // Let unroll hints / pragmas take precedence over PGSO.
                    (hasUnrollTransformation(L) != TM_ForcedByUser &&
                     llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                 PGSOQueryType::IRPass));
  if (OptForSize) {
    UP.Threshold = UP.OptSizeThreshold;
    UP.PartialThreshold = UP.PartialOptSizeThreshold;
    UP.MaxPercentThresholdBoost = 100;
  }

  // Apply any user values specified by cl::opt. getNumOccurrences() > 0
  // means the flag was explicitly passed, so an explicit flag wins even if
  // it matches the built-in default.
  if (UnrollThreshold.getNumOccurrences() > 0)
    UP.Threshold = UnrollThreshold;
  if (UnrollPartialThreshold.getNumOccurrences() > 0)
    UP.PartialThreshold = UnrollPartialThreshold;
  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
    UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
  if (UnrollMaxCount.getNumOccurrences() > 0)
    UP.MaxCount = UnrollMaxCount;
  if (UnrollFullMaxCount.getNumOccurrences() > 0)
    UP.FullUnrollMaxCount = UnrollFullMaxCount;
  if (UnrollAllowPartial.getNumOccurrences() > 0)
    UP.Partial = UnrollAllowPartial;
  if (UnrollAllowRemainder.getNumOccurrences() > 0)
    UP.AllowRemainder = UnrollAllowRemainder;
  if (UnrollRuntime.getNumOccurrences() > 0)
    UP.Runtime = UnrollRuntime;
  // -unroll-max-upperbound=0 disables upper-bound unrolling entirely, even
  // if the target turned it on above.
  if (UnrollMaxUpperBound == 0)
    UP.UpperBound = false;
  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
    UP.UnrollRemainder = UnrollUnrollRemainder;
  if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
    UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;

  // Apply user values provided by argument; these take precedence over
  // everything set above. Note a user threshold sets both the full and the
  // partial threshold.
  if (UserThreshold) {
    UP.Threshold = *UserThreshold;
    UP.PartialThreshold = *UserThreshold;
  }
  if (UserCount)
    UP.Count = *UserCount;
  if (UserAllowPartial)
    UP.Partial = *UserAllowPartial;
  if (UserRuntime)
    UP.Runtime = *UserRuntime;
  if (UserUpperBound)
    UP.UpperBound = *UserUpperBound;
  if (UserFullUnrollMaxCount)
    UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;

  return UP;
}
  244. namespace {
  245. /// A struct to densely store the state of an instruction after unrolling at
  246. /// each iteration.
  247. ///
  248. /// This is designed to work like a tuple of <Instruction *, int> for the
  249. /// purposes of hashing and lookup, but to be able to associate two boolean
  250. /// states with each key.
  251. struct UnrolledInstState {
  252. Instruction *I;
  253. int Iteration : 30;
  254. unsigned IsFree : 1;
  255. unsigned IsCounted : 1;
  256. };
  257. /// Hashing and equality testing for a set of the instruction states.
  258. struct UnrolledInstStateKeyInfo {
  259. using PtrInfo = DenseMapInfo<Instruction *>;
  260. using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
  261. static inline UnrolledInstState getEmptyKey() {
  262. return {PtrInfo::getEmptyKey(), 0, 0, 0};
  263. }
  264. static inline UnrolledInstState getTombstoneKey() {
  265. return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  266. }
  267. static inline unsigned getHashValue(const UnrolledInstState &S) {
  268. return PairInfo::getHashValue({S.I, S.Iteration});
  269. }
  270. static inline bool isEqual(const UnrolledInstState &LHS,
  271. const UnrolledInstState &RHS) {
  272. return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  273. }
  274. };
  275. struct EstimatedUnrollCost {
  276. /// The estimated cost after unrolling.
  277. unsigned UnrolledCost;
  278. /// The estimated dynamic cost of executing the instructions in the
  279. /// rolled form.
  280. unsigned RolledDynamicCost;
  281. };
  282. struct PragmaInfo {
  283. PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
  284. : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
  285. PragmaEnableUnroll(PEU) {}
  286. const bool UserUnrollCount;
  287. const bool PragmaFullUnroll;
  288. const unsigned PragmaCount;
  289. const bool PragmaEnableUnroll;
  290. };
  291. } // end anonymous namespace
/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is std::nullopt.
static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
    const SmallPtrSetImpl<const Value *> &EphValues,
    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
    unsigned MaxIterationsCountToAnalyze) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(MaxIterationsCountToAnalyze <
             (unsigned)(std::numeric_limits<int>::max() / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->isInnermost())
    return std::nullopt;

  // Don't simulate loops with a big or unknown tripcount
  if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
    return std::nullopt;

  // Blocks reached within the current simulated iteration (worklist grows as
  // live successors are discovered).
  SmallSetVector<BasicBlock *, 16> BBWorklist;
  // Loop-exit edges <ExitingBB, ExitBB> discovered during simulation; the
  // live-out values flowing across them are costed after the main loop.
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Value *> SimplifiedValues;
  SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  InstructionCost UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  InstructionCost RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  // Starting from RootI at the given iteration, walks the operand graph
  // backwards (including across loop-header PHIs into earlier iterations),
  // adding the cost of every not-yet-counted, not-free instruction to
  // UnrolledCost.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    TargetTransformInfo::TargetCostKind CostKind =
      RootI.getFunction()->hasMinSize() ?
      TargetTransformInfo::TCK_CodeSize :
      TargetTransformInfo::TCK_SizeAndLatency;
    // Iterations are visited backwards: operands that feed a header PHI were
    // produced in the previous iteration, hence --Iteration per outer step.
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getInstructionCost(I, CostKind);
          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                            << Iteration << "): ");
          LLVM_DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  TargetTransformInfo::TargetCostKind CostKind =
    L->getHeader()->getParent()->hasMinSize() ?
    TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two input: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      // Iteration 0 takes the preheader input; later iterations take the
      // latch (backedge) input, already simplified if we know its value.
      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      if (Iteration != 0 && SimplifiedValues.count(V))
        V = SimplifiedValues.lookup(V);
      SimplifiedInputValues.push_back({PHI, V});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // These won't get into the final code - don't even try calculating the
        // cost for them.
        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                           (unsigned)IsFree,
                                           /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (auto *CI = dyn_cast<CallInst>(&I)) {
          const Function *Callee = CI->getCalledFunction();
          if (!Callee || TTI.isLoweredToCall(Callee)) {
            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
            return std::nullopt;
          }
        }

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
                            << " UnrolledCost: " << UnrolledCost
                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                            << "\n");
          return std::nullopt;
        }
      }

      Instruction *TI = BB->getTerminator();

      // Map a value through SimplifiedValues and report it only if the result
      // is a known constant.
      auto getSimplifiedConstant = [&](Value *V) -> Constant * {
        if (SimplifiedValues.count(V))
          V = SimplifiedValues.lookup(V);
        return dyn_cast<Constant>(V);
      };

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
        }
      }
      // When the taken successor is known, only that edge is live; note that
      // the terminator itself is then not costed for this block.
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
                        << " UnrolledCost: " << UnrolledCost << "\n");
      return std::nullopt;
    }
  }

  // Account for the cost of values live out of the loop: each exit-block PHI
  // observes a value produced inside the loop, costed at the last simulated
  // iteration (TripCount - 1).
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
         "All instructions must have a valid cost, whether the "
         "loop is rolled or unrolled.");

  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
                    << "UnrolledCost: " << UnrolledCost << ", "
                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{unsigned(*UnrolledCost.getValue()),
           unsigned(*RolledDynamicCost.getValue())}};
}
  579. /// ApproximateLoopSize - Approximate the size of the loop.
  580. InstructionCost llvm::ApproximateLoopSize(
  581. const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
  582. const TargetTransformInfo &TTI,
  583. const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
  584. CodeMetrics Metrics;
  585. for (BasicBlock *BB : L->blocks())
  586. Metrics.analyzeBasicBlock(BB, TTI, EphValues);
  587. NumCalls = Metrics.NumInlineCandidates;
  588. NotDuplicatable = Metrics.notDuplicatable;
  589. Convergent = Metrics.convergent;
  590. InstructionCost LoopSize = Metrics.NumInsts;
  591. // Don't allow an estimate of size zero. This would allows unrolling of loops
  592. // with huge iteration counts, which is a compile time problem even if it's
  593. // not a problem for code quality. Also, the code using this size may assume
  594. // that each loop has at least three instructions (likely a conditional
  595. // branch, a comparison feeding that branch, and some kind of loop increment
  596. // feeding that comparison instruction).
  597. if (LoopSize.isValid() && LoopSize < BEInsns + 1)
  598. // This is an open coded max() on InstructionCost
  599. LoopSize = BEInsns + 1;
  600. return LoopSize;
  601. }
  602. // Returns the loop hint metadata node with the given name (for example,
  603. // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
  604. // returned.
  605. static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
  606. if (MDNode *LoopID = L->getLoopID())
  607. return GetUnrollMetadata(LoopID, Name);
  608. return nullptr;
  609. }
  610. // Returns true if the loop has an unroll(full) pragma.
  611. static bool hasUnrollFullPragma(const Loop *L) {
  612. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
  613. }
  614. // Returns true if the loop has an unroll(enable) pragma. This metadata is used
  615. // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
  616. static bool hasUnrollEnablePragma(const Loop *L) {
  617. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
  618. }
  619. // Returns true if the loop has an runtime unroll(disable) pragma.
  620. static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
  621. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
  622. }
  623. // If loop has an unroll_count pragma return the (necessarily
  624. // positive) value from the pragma. Otherwise return 0.
  625. static unsigned unrollCountPragmaValue(const Loop *L) {
  626. MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
  627. if (MD) {
  628. assert(MD->getNumOperands() == 2 &&
  629. "Unroll count hint metadata should have two operands.");
  630. unsigned Count =
  631. mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
  632. assert(Count >= 1 && "Unroll count must be positive.");
  633. return Count;
  634. }
  635. return 0;
  636. }
  637. // Computes the boosting factor for complete unrolling.
  638. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
  639. // be beneficial to fully unroll the loop even if unrolledcost is large. We
  640. // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
  641. // the unroll threshold.
  642. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
  643. unsigned MaxPercentThresholdBoost) {
  644. if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
  645. return 100;
  646. else if (Cost.UnrolledCost != 0)
  647. // The boosting factor is RolledDynamicCost / UnrolledCost
  648. return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
  649. MaxPercentThresholdBoost);
  650. else
  651. return MaxPercentThresholdBoost;
  652. }
  653. // Produce an estimate of the unrolled cost of the specified loop. This
  654. // is used to a) produce a cost estimate for partial unrolling and b) to
  655. // cheaply estimate cost for full unrolling when we don't want to symbolically
  656. // evaluate all iterations.
  657. class UnrollCostEstimator {
  658. const unsigned LoopSize;
  659. public:
  660. UnrollCostEstimator(Loop &L, unsigned LoopSize) : LoopSize(LoopSize) {}
  661. // Returns loop size estimation for unrolled loop, given the unrolling
  662. // configuration specified by UP.
  663. uint64_t
  664. getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP,
  665. const unsigned CountOverwrite = 0) const {
  666. assert(LoopSize >= UP.BEInsns &&
  667. "LoopSize should not be less than BEInsns!");
  668. if (CountOverwrite)
  669. return static_cast<uint64_t>(LoopSize - UP.BEInsns) * CountOverwrite +
  670. UP.BEInsns;
  671. else
  672. return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count +
  673. UP.BEInsns;
  674. }
  675. };
  676. static std::optional<unsigned>
  677. shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
  678. const unsigned TripMultiple, const unsigned TripCount,
  679. const UnrollCostEstimator UCE,
  680. const TargetTransformInfo::UnrollingPreferences &UP) {
  681. // Using unroll pragma
  682. // 1st priority is unroll count set by "unroll-count" option.
  683. if (PInfo.UserUnrollCount) {
  684. if (UP.AllowRemainder &&
  685. UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
  686. return (unsigned)UnrollCount;
  687. }
  688. // 2nd priority is unroll count set by pragma.
  689. if (PInfo.PragmaCount > 0) {
  690. if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
  691. return PInfo.PragmaCount;
  692. }
  693. if (PInfo.PragmaFullUnroll && TripCount != 0)
  694. return TripCount;
  695. // if didn't return until here, should continue to other priorties
  696. return std::nullopt;
  697. }
  698. static std::optional<unsigned> shouldFullUnroll(
  699. Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
  700. ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
  701. const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
  702. const TargetTransformInfo::UnrollingPreferences &UP) {
  703. assert(FullUnrollTripCount && "should be non-zero!");
  704. if (FullUnrollTripCount > UP.FullUnrollMaxCount)
  705. return std::nullopt;
  706. // When computing the unrolled size, note that BEInsns are not replicated
  707. // like the rest of the loop body.
  708. if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
  709. return FullUnrollTripCount;
  710. // The loop isn't that small, but we still can fully unroll it if that
  711. // helps to remove a significant number of instructions.
  712. // To check that, run additional analysis on the loop.
  713. if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
  714. L, FullUnrollTripCount, DT, SE, EphValues, TTI,
  715. UP.Threshold * UP.MaxPercentThresholdBoost / 100,
  716. UP.MaxIterationsCountToAnalyze)) {
  717. unsigned Boost =
  718. getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
  719. if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
  720. return FullUnrollTripCount;
  721. }
  722. return std::nullopt;
  723. }
// Decide the partial-unroll count for a loop with a statically known
// TripCount. Returns std::nullopt when TripCount is unknown (partial
// unrolling does not apply), 0 when partial unrolling is declined, and a
// count >= 2 otherwise.
static std::optional<unsigned>
shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
                    const UnrollCostEstimator UCE,
                    const TargetTransformInfo::UnrollingPreferences &UP) {
  // Partial unrolling only applies when the trip count is known.
  if (!TripCount)
    return std::nullopt;

  if (!UP.Partial) {
    LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
                      << "-unroll-allow-partial not given\n");
    return 0;
  }
  // Start from an explicitly requested count, or the full trip count.
  unsigned count = UP.Count;
  if (count == 0)
    count = TripCount;
  if (UP.PartialThreshold != NoThreshold) {
    // Reduce unroll count to be modulo of TripCount for partial unrolling.
    // If the requested count blows the threshold, shrink it to the largest
    // count whose unrolled size still fits (body size scales linearly in
    // count; BEInsns are not replicated).
    if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
      count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
              (LoopSize - UP.BEInsns);
    if (count > UP.MaxCount)
      count = UP.MaxCount;
    // Walk the count down until it evenly divides the trip count, so no
    // remainder loop is needed.
    while (count != 0 && TripCount % count != 0)
      count--;
    if (UP.AllowRemainder && count <= 1) {
      // If there is no Count that is modulo of TripCount, set Count to
      // largest power-of-two factor that satisfies the threshold limit.
      // As we'll create fixup loop, do the type of unrolling only if
      // remainder loop is allowed.
      count = UP.DefaultUnrollRuntimeCount;
      while (count != 0 &&
             UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
        count >>= 1;
    }
    // A count below 2 means no useful unrolling was found.
    if (count < 2) {
      count = 0;
    }
  } else {
    // No threshold: unroll completely by the trip count.
    count = TripCount;
  }
  // Re-clamp: the no-threshold path above can exceed MaxCount.
  if (count > UP.MaxCount)
    count = UP.MaxCount;

  LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");

  return count;
}
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
// Unless IgnoreUser is true, will also use metadata and command-line options
// that are specific to the LoopUnroll pass (which, for instance, are
// irrelevant for the LoopUnrollAndJam pass).
// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
// many LoopUnroll-specific options. The shared functionality should be
// refactored into it own function.
//
// The decision proceeds through seven priorities, highest first:
//   1) -unroll-count option, 2) pragma count, 3) exact full unroll,
//   4) bounded (upper-bound) unroll, 5) peeling, 6) partial unroll,
//   7) runtime unroll. The first priority that decides a count returns.
bool llvm::computeUnrollCount(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
    AssumptionCache *AC,
    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
    OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
    bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize,
    TargetTransformInfo::UnrollingPreferences &UP,
    TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {

  UnrollCostEstimator UCE(*L, LoopSize);

  // Gather all explicit unroll directives that apply to this loop.
  const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
  const bool PragmaFullUnroll = hasUnrollFullPragma(L);
  const unsigned PragmaCount = unrollCountPragmaValue(L);
  const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
  const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                              PragmaEnableUnroll || UserUnrollCount;

  PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
                   PragmaEnableUnroll);

  // Use an explicit peel count that has been specified for testing. In this
  // case it's not permitted to also specify an explicit unroll count.
  if (PP.PeelCount) {
    if (UnrollCount.getNumOccurrences() > 0) {
      report_fatal_error("Cannot specify both explicit peel count and "
                         "explicit unroll count", /*GenCrashDiag=*/false);
    }
    UP.Count = 1;
    UP.Runtime = false;
    return true;
  }

  // Check for explicit Count.
  // 1st priority is unroll count set by "unroll-count" option.
  // 2nd priority is unroll count set by pragma.
  if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
                                             UCE, UP)) {
    UP.Count = *UnrollFactor;

    if (UserUnrollCount || (PragmaCount > 0)) {
      UP.AllowExpensiveTripCount = true;
      UP.Force = true;
    }
    UP.Runtime |= (PragmaCount > 0);
    return ExplicitUnroll;
  } else {
    if (ExplicitUnroll && TripCount != 0) {
      // If the loop has an unrolling pragma, we want to be more aggressive with
      // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
      // value which is larger than the default limits.
      UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
      UP.PartialThreshold =
          std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
    }
  }

  // 3rd priority is exact full unrolling. This will eliminate all copies
  // of some exit test.
  UP.Count = 0;
  if (TripCount) {
    UP.Count = TripCount;
    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                             TripCount, UCE, UP)) {
      UP.Count = *UnrollFactor;
      UseUpperBound = false;
      return ExplicitUnroll;
    }
  }

  // 4th priority is bounded unrolling.
  // We can unroll by the upper bound amount if it's generally allowed or if
  // we know that the loop is executed either the upper bound or zero times.
  // (MaxOrZero unrolling keeps only the first loop test, so the number of
  // loop tests remains the same compared to the non-unrolled version, whereas
  // the generic upper bound unrolling keeps all but the last loop test so the
  // number of loop tests goes up which may end up being worse on targets with
  // constrained branch predictor resources so is controlled by an option.)
  // In addition we only unroll small upper bounds.
  // Note that the cost of bounded unrolling is always strictly greater than
  // cost of exact full unrolling. As such, if we have an exact count and
  // found it unprofitable, we'll never chose to bounded unroll.
  if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
      MaxTripCount <= UnrollMaxUpperBound) {
    UP.Count = MaxTripCount;
    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                             MaxTripCount, UCE, UP)) {
      UP.Count = *UnrollFactor;
      UseUpperBound = true;
      return ExplicitUnroll;
    }
  }

  // 5th priority is loop peeling.
  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
  if (PP.PeelCount) {
    // Peeling and unrolling are mutually exclusive in one step.
    UP.Runtime = false;
    UP.Count = 1;
    return ExplicitUnroll;
  }

  // Before starting partial unrolling, set up.partial to true,
  // if user explicitly asked for unrolling
  if (TripCount)
    UP.Partial |= ExplicitUnroll;

  // 6th priority is partial unrolling.
  // Try partial unroll only when TripCount could be statically calculated.
  if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
    UP.Count = *UnrollFactor;

    // Emit a remark if a pragma asked for more than we could deliver.
    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
        UP.Count != TripCount)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "FullUnrollAsDirectedTooLarge",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to fully unroll loop as directed by unroll pragma "
                  "because "
                  "unrolled size is too large.";
      });

    if (UP.PartialThreshold != NoThreshold) {
      if (UP.Count == 0) {
        if (PragmaEnableUnroll)
          ORE->emit([&]() {
            return OptimizationRemarkMissed(DEBUG_TYPE,
                                            "UnrollAsDirectedTooLarge",
                                            L->getStartLoc(), L->getHeader())
                   << "Unable to unroll loop as directed by unroll(enable) "
                      "pragma "
                      "because unrolled size is too large.";
          });
      }
    }
    return ExplicitUnroll;
  }
  // shouldPartialUnroll only returns nullopt for unknown trip counts.
  assert(TripCount == 0 &&
         "All cases when TripCount is constant should be covered here.");
  if (PragmaFullUnroll)
    ORE->emit([&]() {
      return OptimizationRemarkMissed(
                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
                 L->getStartLoc(), L->getHeader())
             << "Unable to fully unroll loop as directed by unroll(full) "
                "pragma "
                "because loop has a runtime trip count.";
    });

  // 7th priority is runtime unrolling.
  // Don't unroll a runtime trip count loop when it is disabled.
  if (hasRuntimeUnrollDisablePragma(L)) {
    UP.Count = 0;
    return false;
  }

  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
    UP.Count = 0;
    return false;
  }

  // Check if the runtime trip count is too small when profile is available.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
      if (*ProfileTripCount < FlatLoopTripCountThreshold)
        return false;
      else
        UP.AllowExpensiveTripCount = true;
    }
  }
  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
  if (!UP.Runtime) {
    LLVM_DEBUG(
        dbgs() << " will not try to unroll loop with runtime trip count "
               << "-unroll-runtime not given\n");
    UP.Count = 0;
    return false;
  }
  if (UP.Count == 0)
    UP.Count = UP.DefaultUnrollRuntimeCount;

  // Reduce unroll count to be the largest power-of-two factor of
  // the original count which satisfies the threshold limit.
  while (UP.Count != 0 &&
         UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
    UP.Count >>= 1;

#ifndef NDEBUG
  // Only needed for the LLVM_DEBUG message below (compiled out with NDEBUG).
  unsigned OrigCount = UP.Count;
#endif

  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
    // Without a remainder loop the count must divide the trip multiple;
    // halve until it does (the short-circuit above keeps UP.Count != 0
    // before the modulo).
    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
      UP.Count >>= 1;
    LLVM_DEBUG(
        dbgs() << "Remainder loop is restricted (that could architecture "
                  "specific or because the loop contains a convergent "
                  "instruction), so unroll count must divide the trip "
                  "multiple, "
               << TripMultiple << ". Reducing unroll count from " << OrigCount
               << " to " << UP.Count << ".\n");

    using namespace ore;

    if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "DifferentUnrollCountFromDirected",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to unroll loop the number of times directed by "
                  "unroll_count pragma because remainder loop is restricted "
                  "(that could architecture specific or because the loop "
                  "contains a convergent instruction) and so must have an "
                  "unroll "
                  "count that divides the loop trip multiple of "
               << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
               << NV("UnrollCount", UP.Count) << " time(s).";
      });
  }

  // Final clamps against the configured maximum and the known upper bound.
  if (UP.Count > UP.MaxCount)
    UP.Count = UP.MaxCount;

  if (MaxTripCount && UP.Count > MaxTripCount)
    UP.Count = MaxTripCount;

  LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
                    << "\n");
  // A count below 2 is not an unroll; signal "don't unroll" with 0.
  if (UP.Count < 2)
    UP.Count = 0;
  return ExplicitUnroll;
}
// Top-level driver: gathers preferences, sizes the loop, computes an unroll
// (or peel) count via computeUnrollCount(), performs the transformation, and
// updates follow-up loop metadata. Returns how the loop was changed.
static LoopUnrollResult
tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
                const TargetTransformInfo &TTI, AssumptionCache &AC,
                OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
                ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
                bool OnlyWhenForced, bool ForgetAllSCEV,
                std::optional<unsigned> ProvidedCount,
                std::optional<unsigned> ProvidedThreshold,
                std::optional<bool> ProvidedAllowPartial,
                std::optional<bool> ProvidedRuntime,
                std::optional<bool> ProvidedUpperBound,
                std::optional<bool> ProvidedAllowPeeling,
                std::optional<bool> ProvidedAllowProfileBasedPeeling,
                std::optional<unsigned> ProvidedFullUnrollMaxCount) {

  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                    << L->getHeader()->getParent()->getName() << "] Loop %"
                    << L->getHeader()->getName() << "\n");
  TransformationMode TM = hasUnrollTransformation(L);
  if (TM & TM_Disable)
    return LoopUnrollResult::Unmodified;

  // If this loop isn't forced to be unrolled, avoid unrolling it when the
  // parent loop has an explicit unroll-and-jam pragma. This is to prevent
  // automatic unrolling from interfering with the user requested
  // transformation.
  Loop *ParentL = L->getParentLoop();
  if (ParentL != nullptr &&
      hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
      hasUnrollTransformation(L) != TM_ForcedByUser) {
    LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
                      << " llvm.loop.unroll_and_jam.\n");
    return LoopUnrollResult::Unmodified;
  }

  // If this loop isn't forced to be unrolled, avoid unrolling it when the
  // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
  // unrolling from interfering with the user requested transformation.
  if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
      hasUnrollTransformation(L) != TM_ForcedByUser) {
    LLVM_DEBUG(
        dbgs()
        << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
    return LoopUnrollResult::Unmodified;
  }

  if (!L->isLoopSimplifyForm()) {
    LLVM_DEBUG(
        dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When automatic unrolling is disabled, do not unroll unless overridden for
  // this loop.
  if (OnlyWhenForced && !(TM & TM_Enable))
    return LoopUnrollResult::Unmodified;

  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
      ProvidedFullUnrollMaxCount);
  TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
      L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);

  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
  // as threshold later on.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
      !OptForSize)
    return LoopUnrollResult::Unmodified;

  // Ephemeral values (only feeding llvm.assume etc.) are excluded from the
  // size estimate.
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  InstructionCost LoopSizeIC =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSizeIC << "\n");

  if (!LoopSizeIC.isValid()) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
                      << " with invalid cost.\n");
    return LoopUnrollResult::Unmodified;
  }
  unsigned LoopSize = *LoopSizeIC.getValue();

  if (NotDuplicatable) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
                      << " instructions.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
  // later), to (fully) unroll loops, if it does not increase code size.
  if (OptForSize)
    UP.Threshold = std::max(UP.Threshold, LoopSize + 1);

  if (NumInlineCandidates != 0) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Find the smallest exact trip count for any exit. This is an upper bound
  // on the loop trip count, but an exit at an earlier iteration is still
  // possible. An unroll by the smallest exact trip count guarantees that all
  // branches relating to at least one exit can be eliminated. This is unlike
  // the max trip count, which only guarantees that the backedge can be broken.
  unsigned TripCount = 0;
  unsigned TripMultiple = 1;
  SmallVector<BasicBlock *, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  for (BasicBlock *ExitingBlock : ExitingBlocks)
    if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
      if (!TripCount || TC < TripCount)
        TripCount = TripMultiple = TC;

  if (!TripCount) {
    // If no exact trip count is known, determine the trip multiple of either
    // the loop latch or the single exiting block.
    // TODO: Relax for multiple exits.
    BasicBlock *ExitingBlock = L->getLoopLatch();
    if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
      ExitingBlock = L->getExitingBlock();
    if (ExitingBlock)
      TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
  }

  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation. Therefore restrict remainder loop (try unrolling without).
  //
  // TODO: This is quite conservative. In practice, convergent_op()
  // is likely to be called unconditionally in the loop. In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe. But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;

  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  unsigned MaxTripCount = 0;
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
  }

  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly = computeUnrollCount(
      L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
      TripMultiple, LoopSize, UP, PP, UseUpperBound);
  if (!UP.Count)
    return LoopUnrollResult::Unmodified;

  if (PP.PeelCount) {
    assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
    LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
                      << " with iteration count " << PP.PeelCount << "!\n");
    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
                                L->getHeader())
             << " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
             << " iterations";
    });

    ValueToValueMapTy VMap;
    if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
      // If the loop was peeled, we already "used up" the profile information
      // we had, so we don't want to unroll or peel again.
      if (PP.PeelProfiledIterations)
        L->setLoopAlreadyUnrolled();
      return LoopUnrollResult::PartiallyUnrolled;
    }
    return LoopUnrollResult::Unmodified;
  }

  // At this point, UP.Runtime indicates that run-time unrolling is allowed.
  // However, we only want to actually perform it if we don't know the trip
  // count and the unroll count doesn't divide the known trip multiple.
  // TODO: This decision should probably be pushed up into
  // computeUnrollCount().
  UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;

  // Save loop properties before it is transformed.
  MDNode *OrigLoopID = L->getLoopID();

  // Unroll the loop.
  Loop *RemainderLoop = nullptr;
  LoopUnrollResult UnrollResult = UnrollLoop(
      L,
      {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
       UP.UnrollRemainder, ForgetAllSCEV},
      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
  if (UnrollResult == LoopUnrollResult::Unmodified)
    return LoopUnrollResult::Unmodified;

  // Propagate follow-up metadata to the remainder loop, if one was created.
  if (RemainderLoop) {
    std::optional<MDNode *> RemainderLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupRemainder});
    if (RemainderLoopID)
      RemainderLoop->setLoopID(*RemainderLoopID);
  }

  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
    std::optional<MDNode *> NewLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupUnrolled});
    if (NewLoopID) {
      L->setLoopID(*NewLoopID);

      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
      // explicitly.
      return UnrollResult;
    }
  }

  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
    L->setLoopAlreadyUnrolled();

  return UnrollResult;
}
  1191. namespace {
  1192. class LoopUnroll : public LoopPass {
  1193. public:
  1194. static char ID; // Pass ID, replacement for typeid
  1195. int OptLevel;
  1196. /// If false, use a cost model to determine whether unrolling of a loop is
  1197. /// profitable. If true, only loops that explicitly request unrolling via
  1198. /// metadata are considered. All other loops are skipped.
  1199. bool OnlyWhenForced;
  1200. /// If false, when SCEV is invalidated, only forget everything in the
  1201. /// top-most loop (call forgetTopMostLoop), of the loop being processed.
  1202. /// Otherwise, forgetAllLoops and rebuild when needed next.
  1203. bool ForgetAllSCEV;
  1204. std::optional<unsigned> ProvidedCount;
  1205. std::optional<unsigned> ProvidedThreshold;
  1206. std::optional<bool> ProvidedAllowPartial;
  1207. std::optional<bool> ProvidedRuntime;
  1208. std::optional<bool> ProvidedUpperBound;
  1209. std::optional<bool> ProvidedAllowPeeling;
  1210. std::optional<bool> ProvidedAllowProfileBasedPeeling;
  1211. std::optional<unsigned> ProvidedFullUnrollMaxCount;
  1212. LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
  1213. bool ForgetAllSCEV = false,
  1214. std::optional<unsigned> Threshold = std::nullopt,
  1215. std::optional<unsigned> Count = std::nullopt,
  1216. std::optional<bool> AllowPartial = std::nullopt,
  1217. std::optional<bool> Runtime = std::nullopt,
  1218. std::optional<bool> UpperBound = std::nullopt,
  1219. std::optional<bool> AllowPeeling = std::nullopt,
  1220. std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
  1221. std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
  1222. : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
  1223. ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
  1224. ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
  1225. ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
  1226. ProvidedAllowPeeling(AllowPeeling),
  1227. ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
  1228. ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
  1229. initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
  1230. }
  1231. bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  1232. if (skipLoop(L))
  1233. return false;
  1234. Function &F = *L->getHeader()->getParent();
  1235. auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1236. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1237. ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1238. const TargetTransformInfo &TTI =
  1239. getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1240. auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  1241. // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
  1242. // pass. Function analyses need to be preserved across loop transformations
  1243. // but ORE cannot be preserved (see comment before the pass definition).
  1244. OptimizationRemarkEmitter ORE(&F);
  1245. bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  1246. LoopUnrollResult Result = tryToUnrollLoop(
  1247. L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
  1248. OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
  1249. ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
  1250. ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
  1251. ProvidedFullUnrollMaxCount);
  1252. if (Result == LoopUnrollResult::FullyUnrolled)
  1253. LPM.markLoopAsDeleted(*L);
  1254. return Result != LoopUnrollResult::Unmodified;
  1255. }
  1256. /// This transformation requires natural loop information & requires that
  1257. /// loop preheaders be inserted into the CFG...
  1258. void getAnalysisUsage(AnalysisUsage &AU) const override {
  1259. AU.addRequired<AssumptionCacheTracker>();
  1260. AU.addRequired<TargetTransformInfoWrapperPass>();
  1261. // FIXME: Loop passes are required to preserve domtree, and for now we just
  1262. // recreate dom info if anything gets unrolled.
  1263. getLoopAnalysisUsage(AU);
  1264. }
  1265. };
  1266. } // end anonymous namespace
char LoopUnroll::ID = 0;

// Register the legacy pass under the command-line name "loop-unroll" and
// declare its analysis dependencies with the global PassRegistry.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
  1273. Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
  1274. bool ForgetAllSCEV, int Threshold, int Count,
  1275. int AllowPartial, int Runtime, int UpperBound,
  1276. int AllowPeeling) {
  1277. // TODO: It would make more sense for this function to take the optionals
  1278. // directly, but that's dangerous since it would silently break out of tree
  1279. // callers.
  1280. return new LoopUnroll(
  1281. OptLevel, OnlyWhenForced, ForgetAllSCEV,
  1282. Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
  1283. Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
  1284. AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
  1285. Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
  1286. UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
  1287. AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
  1288. }
  1289. Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
  1290. bool ForgetAllSCEV) {
  1291. return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
  1292. 0, 0, 0, 1);
  1293. }
/// New-PM loop pass entry point restricted to *full* unrolling: partial,
/// runtime and upper-bound unrolling are disabled and no explicit count or
/// threshold is supplied, so tryToUnrollLoop() can only completely unroll
/// (or peel) the loop. Afterwards the LPMUpdater is informed of any sibling
/// loops exposed by the transform and of the loop's deletion if it was
/// fully unrolled.
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

  // Capture the name now: if the loop is fully unrolled, the Loop object is
  // destroyed and the name is needed to report the deletion below.
  std::string LoopName = std::string(L.getName());

  bool Changed =
      tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
                      /*BFI*/ nullptr, /*PSI*/ nullptr,
                      /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
                      ForgetSCEV, /*Count*/ std::nullopt,
                      /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
                      /*Runtime*/ false, /*UpperBound*/ false,
                      /*AllowPeeling*/ true,
                      /*AllowProfileBasedPeeling*/ false,
                      /*FullUnrollMaxCount*/ std::nullopt) !=
      LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Filter the sibling list down to loops that are genuinely new: seeing the
  // current loop proves it survived, and anything in OldLoops predates the
  // unroll.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.contains(SibLoop);
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    // The loop was destroyed by full unrolling; report it under the name
    // saved before the transform ran.
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}
/// Function-level driver for the new-PM unroller: simplifies every loop and
/// forms LCSSA up front, then walks all loops (outer-to-inner via a priority
/// worklist) and invokes tryToUnrollLoop() with the flavors selected in
/// UnrollOpts at construction time.
PreservedAnalyses LoopUnrollPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other expensive
  // analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // Use cached loop analyses if available so fully-unrolled loops can be
  // evicted below; do not force their computation.
  LoopAnalysisManager *LAM = nullptr;
  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
    LAM = &LAMProxy->getManager();

  // Profile info (and BFI, only when a profile summary exists) feeds the
  // profitability heuristics.
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
      &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;

  bool Changed = false;

  // The unroller requires loops to be in simplified form, and also needs LCSSA.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop unroller
  // will simplify all loops, regardless of whether anything end up being
  // unrolled.
  for (const auto &L : LI) {
    Changed |=
        simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  // Add the loop nests in the reverse order of LoopInfo. See method
  // declaration.
  SmallPriorityWorklist<Loop *, 4> Worklist;
  appendLoopsToWorklist(LI, Worklist);

  while (!Worklist.empty()) {
    // Because the LoopInfo stores the loops in RPO, we walk the worklist
    // from back to front so that we work forward across the CFG, which
    // for unrolling is only needed to get optimization remarks emitted in
    // a forward order.
    Loop &L = *Worklist.pop_back_val();
#ifndef NDEBUG
    Loop *ParentL = L.getParentLoop();
#endif

    // Check if the profile summary indicates that the profiled application
    // has a huge working set size, in which case we disable peeling to avoid
    // bloating it further.
    std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
    if (PSI && PSI->hasHugeWorkingSetSize())
      LocalAllowPeeling = false;
    // Save the name before unrolling: a fully unrolled loop's Loop object is
    // gone by the time its cached analyses are cleared below.
    std::string LoopName = std::string(L.getName());
    // The API here is quite complex to call and we allow to select some
    // flavors of unrolling during construction time (by setting UnrollOpts).
    LoopUnrollResult Result = tryToUnrollLoop(
        &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
        UnrollOpts.ForgetSCEV, /*Count*/ std::nullopt,
        /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
        UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
    Changed |= Result != LoopUnrollResult::Unmodified;

    // The parent must not be damaged by unrolling!
#ifndef NDEBUG
    if (Result != LoopUnrollResult::Unmodified && ParentL)
      ParentL->verifyLoop();
#endif

    // Clear any cached analysis results for L if we removed it completely.
    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
      LAM->clear(L, LoopName);
  }

  if (!Changed)
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}
  1445. void LoopUnrollPass::printPipeline(
  1446. raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  1447. static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
  1448. OS, MapClassName2PassName);
  1449. OS << "<";
  1450. if (UnrollOpts.AllowPartial != std::nullopt)
  1451. OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
  1452. if (UnrollOpts.AllowPeeling != std::nullopt)
  1453. OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
  1454. if (UnrollOpts.AllowRuntime != std::nullopt)
  1455. OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
  1456. if (UnrollOpts.AllowUpperBound != std::nullopt)
  1457. OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
  1458. if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
  1459. OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
  1460. << "profile-peeling;";
  1461. if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
  1462. OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";";
  1463. OS << "O" << UnrollOpts.OptLevel;
  1464. OS << ">";
  1465. }