LoopUnrollPass.cpp 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651
  1. //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass implements a simple loop unroller. It works best when loops have
  10. // been canonicalized by the -indvars pass, allowing it to determine the trip
  11. // counts of loops easily.
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
  14. #include "llvm/ADT/DenseMap.h"
  15. #include "llvm/ADT/DenseMapInfo.h"
  16. #include "llvm/ADT/DenseSet.h"
  17. #include "llvm/ADT/None.h"
  18. #include "llvm/ADT/Optional.h"
  19. #include "llvm/ADT/STLExtras.h"
  20. #include "llvm/ADT/SetVector.h"
  21. #include "llvm/ADT/SmallPtrSet.h"
  22. #include "llvm/ADT/SmallVector.h"
  23. #include "llvm/ADT/StringRef.h"
  24. #include "llvm/Analysis/AssumptionCache.h"
  25. #include "llvm/Analysis/BlockFrequencyInfo.h"
  26. #include "llvm/Analysis/CodeMetrics.h"
  27. #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
  28. #include "llvm/Analysis/LoopAnalysisManager.h"
  29. #include "llvm/Analysis/LoopInfo.h"
  30. #include "llvm/Analysis/LoopPass.h"
  31. #include "llvm/Analysis/LoopUnrollAnalyzer.h"
  32. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  33. #include "llvm/Analysis/ProfileSummaryInfo.h"
  34. #include "llvm/Analysis/ScalarEvolution.h"
  35. #include "llvm/Analysis/TargetTransformInfo.h"
  36. #include "llvm/IR/BasicBlock.h"
  37. #include "llvm/IR/CFG.h"
  38. #include "llvm/IR/Constant.h"
  39. #include "llvm/IR/Constants.h"
  40. #include "llvm/IR/DiagnosticInfo.h"
  41. #include "llvm/IR/Dominators.h"
  42. #include "llvm/IR/Function.h"
  43. #include "llvm/IR/Instruction.h"
  44. #include "llvm/IR/Instructions.h"
  45. #include "llvm/IR/IntrinsicInst.h"
  46. #include "llvm/IR/Metadata.h"
  47. #include "llvm/IR/PassManager.h"
  48. #include "llvm/InitializePasses.h"
  49. #include "llvm/Pass.h"
  50. #include "llvm/Support/Casting.h"
  51. #include "llvm/Support/CommandLine.h"
  52. #include "llvm/Support/Debug.h"
  53. #include "llvm/Support/ErrorHandling.h"
  54. #include "llvm/Support/raw_ostream.h"
  55. #include "llvm/Transforms/Scalar.h"
  56. #include "llvm/Transforms/Scalar/LoopPassManager.h"
  57. #include "llvm/Transforms/Utils.h"
  58. #include "llvm/Transforms/Utils/LoopPeel.h"
  59. #include "llvm/Transforms/Utils/LoopSimplify.h"
  60. #include "llvm/Transforms/Utils/LoopUtils.h"
  61. #include "llvm/Transforms/Utils/SizeOpts.h"
  62. #include "llvm/Transforms/Utils/UnrollLoop.h"
  63. #include <algorithm>
  64. #include <cassert>
  65. #include <cstdint>
  66. #include <limits>
  67. #include <string>
  68. #include <tuple>
  69. #include <utility>
  70. using namespace llvm;
  71. #define DEBUG_TYPE "loop-unroll"
  72. cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
  73. "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
  74. cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
  75. " the current top-most loop. This is sometimes preferred to reduce"
  76. " compile time."));
  77. static cl::opt<unsigned>
  78. UnrollThreshold("unroll-threshold", cl::Hidden,
  79. cl::desc("The cost threshold for loop unrolling"));
  80. static cl::opt<unsigned>
  81. UnrollOptSizeThreshold(
  82. "unroll-optsize-threshold", cl::init(0), cl::Hidden,
  83. cl::desc("The cost threshold for loop unrolling when optimizing for "
  84. "size"));
  85. static cl::opt<unsigned> UnrollPartialThreshold(
  86. "unroll-partial-threshold", cl::Hidden,
  87. cl::desc("The cost threshold for partial loop unrolling"));
  88. static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
  89. "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
  90. cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
  91. "to the threshold when aggressively unrolling a loop due to the "
  92. "dynamic cost savings. If completely unrolling a loop will reduce "
  93. "the total runtime from X to Y, we boost the loop unroll "
  94. "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
  95. "X/Y). This limit avoids excessive code bloat."));
  96. static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
  97. "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
  98. cl::desc("Don't allow loop unrolling to simulate more than this number of"
  99. "iterations when checking full unroll profitability"));
  100. static cl::opt<unsigned> UnrollCount(
  101. "unroll-count", cl::Hidden,
  102. cl::desc("Use this unroll count for all loops including those with "
  103. "unroll_count pragma values, for testing purposes"));
  104. static cl::opt<unsigned> UnrollMaxCount(
  105. "unroll-max-count", cl::Hidden,
  106. cl::desc("Set the max unroll count for partial and runtime unrolling, for"
  107. "testing purposes"));
  108. static cl::opt<unsigned> UnrollFullMaxCount(
  109. "unroll-full-max-count", cl::Hidden,
  110. cl::desc(
  111. "Set the max unroll count for full unrolling, for testing purposes"));
  112. static cl::opt<bool>
  113. UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
  114. cl::desc("Allows loops to be partially unrolled until "
  115. "-unroll-threshold loop size is reached."));
  116. static cl::opt<bool> UnrollAllowRemainder(
  117. "unroll-allow-remainder", cl::Hidden,
  118. cl::desc("Allow generation of a loop remainder (extra iterations) "
  119. "when unrolling a loop."));
  120. static cl::opt<bool>
  121. UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
  122. cl::desc("Unroll loops with run-time trip counts"));
  123. static cl::opt<unsigned> UnrollMaxUpperBound(
  124. "unroll-max-upperbound", cl::init(8), cl::Hidden,
  125. cl::desc(
  126. "The max of trip count upper bound that is considered in unrolling"));
  127. static cl::opt<unsigned> PragmaUnrollThreshold(
  128. "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
  129. cl::desc("Unrolled size limit for loops with an unroll(full) or "
  130. "unroll_count pragma."));
  131. static cl::opt<unsigned> FlatLoopTripCountThreshold(
  132. "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
  133. cl::desc("If the runtime tripcount for the loop is lower than the "
  134. "threshold, the loop is considered as flat and will be less "
  135. "aggressively unrolled."));
  136. static cl::opt<bool> UnrollUnrollRemainder(
  137. "unroll-remainder", cl::Hidden,
  138. cl::desc("Allow the loop remainder to be unrolled."));
  139. // This option isn't ever intended to be enabled, it serves to allow
  140. // experiments to check the assumptions about when this kind of revisit is
  141. // necessary.
  142. static cl::opt<bool> UnrollRevisitChildLoops(
  143. "unroll-revisit-child-loops", cl::Hidden,
  144. cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
  145. "This shouldn't typically be needed as child loops (or their "
  146. "clones) were already visited."));
  147. static cl::opt<unsigned> UnrollThresholdAggressive(
  148. "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
  149. cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
  150. "optimizations"));
  151. static cl::opt<unsigned>
  152. UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
  153. cl::Hidden,
  154. cl::desc("Default threshold (max size of unrolled "
  155. "loop), used in all but O3 optimizations"));
  156. /// A magic value for use with the Threshold parameter to indicate
  157. /// that the loop unroll should be performed regardless of how much
  158. /// code expansion would result.
  159. static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
  160. /// Gather the various unrolling parameters based on the defaults, compiler
  161. /// flags, TTI overrides and user specified parameters.
  162. TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
  163. Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
  164. BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
  165. OptimizationRemarkEmitter &ORE, int OptLevel,
  166. Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
  167. Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
  168. Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
  169. TargetTransformInfo::UnrollingPreferences UP;
  170. // Set up the defaults
  171. UP.Threshold =
  172. OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
  173. UP.MaxPercentThresholdBoost = 400;
  174. UP.OptSizeThreshold = UnrollOptSizeThreshold;
  175. UP.PartialThreshold = 150;
  176. UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
  177. UP.Count = 0;
  178. UP.DefaultUnrollRuntimeCount = 8;
  179. UP.MaxCount = std::numeric_limits<unsigned>::max();
  180. UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
  181. UP.BEInsns = 2;
  182. UP.Partial = false;
  183. UP.Runtime = false;
  184. UP.AllowRemainder = true;
  185. UP.UnrollRemainder = false;
  186. UP.AllowExpensiveTripCount = false;
  187. UP.Force = false;
  188. UP.UpperBound = false;
  189. UP.UnrollAndJam = false;
  190. UP.UnrollAndJamInnerLoopThreshold = 60;
  191. UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
  192. // Override with any target specific settings
  193. TTI.getUnrollingPreferences(L, SE, UP, &ORE);
  194. // Apply size attributes
  195. bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
  196. // Let unroll hints / pragmas take precedence over PGSO.
  197. (hasUnrollTransformation(L) != TM_ForcedByUser &&
  198. llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
  199. PGSOQueryType::IRPass));
  200. if (OptForSize) {
  201. UP.Threshold = UP.OptSizeThreshold;
  202. UP.PartialThreshold = UP.PartialOptSizeThreshold;
  203. UP.MaxPercentThresholdBoost = 100;
  204. }
  205. // Apply any user values specified by cl::opt
  206. if (UnrollThreshold.getNumOccurrences() > 0)
  207. UP.Threshold = UnrollThreshold;
  208. if (UnrollPartialThreshold.getNumOccurrences() > 0)
  209. UP.PartialThreshold = UnrollPartialThreshold;
  210. if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
  211. UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
  212. if (UnrollMaxCount.getNumOccurrences() > 0)
  213. UP.MaxCount = UnrollMaxCount;
  214. if (UnrollFullMaxCount.getNumOccurrences() > 0)
  215. UP.FullUnrollMaxCount = UnrollFullMaxCount;
  216. if (UnrollAllowPartial.getNumOccurrences() > 0)
  217. UP.Partial = UnrollAllowPartial;
  218. if (UnrollAllowRemainder.getNumOccurrences() > 0)
  219. UP.AllowRemainder = UnrollAllowRemainder;
  220. if (UnrollRuntime.getNumOccurrences() > 0)
  221. UP.Runtime = UnrollRuntime;
  222. if (UnrollMaxUpperBound == 0)
  223. UP.UpperBound = false;
  224. if (UnrollUnrollRemainder.getNumOccurrences() > 0)
  225. UP.UnrollRemainder = UnrollUnrollRemainder;
  226. if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
  227. UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
  228. // Apply user values provided by argument
  229. if (UserThreshold.hasValue()) {
  230. UP.Threshold = *UserThreshold;
  231. UP.PartialThreshold = *UserThreshold;
  232. }
  233. if (UserCount.hasValue())
  234. UP.Count = *UserCount;
  235. if (UserAllowPartial.hasValue())
  236. UP.Partial = *UserAllowPartial;
  237. if (UserRuntime.hasValue())
  238. UP.Runtime = *UserRuntime;
  239. if (UserUpperBound.hasValue())
  240. UP.UpperBound = *UserUpperBound;
  241. if (UserFullUnrollMaxCount.hasValue())
  242. UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
  243. return UP;
  244. }
namespace {

/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
/// This is designed to work like a tuple of <Instruction *, int> for the
/// purposes of hashing and lookup, but to be able to associate two boolean
/// states with each key.
struct UnrolledInstState {
  Instruction *I;
  // Packed into a single word alongside the two flag bits below; the key
  // identity is (I, Iteration), the flags are mutable payload.
  int Iteration : 30;
  unsigned IsFree : 1;
  unsigned IsCounted : 1;
};

/// Hashing and equality testing for a set of the instruction states.
struct UnrolledInstStateKeyInfo {
  using PtrInfo = DenseMapInfo<Instruction *>;
  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;

  static inline UnrolledInstState getEmptyKey() {
    return {PtrInfo::getEmptyKey(), 0, 0, 0};
  }

  static inline UnrolledInstState getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  }

  // Note: only (I, Iteration) participate in hashing and equality; the
  // IsFree/IsCounted bits are deliberately ignored so the flags can be
  // updated in place without disturbing the set.
  static inline unsigned getHashValue(const UnrolledInstState &S) {
    return PairInfo::getHashValue({S.I, S.Iteration});
  }

  static inline bool isEqual(const UnrolledInstState &LHS,
                             const UnrolledInstState &RHS) {
    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  }
};

struct EstimatedUnrollCost {
  /// The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};

/// Immutable bundle of the user/pragma unrolling hints for a loop.
/// Field meanings follow the constructor parameter order; presumably
/// UserUnrollCount reflects an explicit -unroll-count and the Pragma*
/// fields reflect unroll pragmas on the loop -- confirm against the
/// (out-of-view) code that constructs PragmaInfo.
struct PragmaInfo {
  PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
      : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
        PragmaEnableUnroll(PEU) {}
  const bool UserUnrollCount;
  const bool PragmaFullUnroll;
  const unsigned PragmaCount;
  const bool PragmaEnableUnroll;
};

} // end anonymous namespace
/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
    const SmallPtrSetImpl<const Value *> &EphValues,
    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
    unsigned MaxIterationsCountToAnalyze) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(MaxIterationsCountToAnalyze <
             (unsigned)(std::numeric_limits<int>::max() / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->isInnermost())
    return None;

  // Don't simulate loops with a big or unknown tripcount
  if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  // Pairs of (exiting block inside the loop, exit block outside it) whose
  // LCSSA PHI inputs still need to be costed after the simulation.
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Value *> SimplifiedValues;
  SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  InstructionCost UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  InstructionCost RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop. Walks
  // the operand graph backwards from RootI, crossing loop-header PHIs into
  // earlier iterations (hence the decreasing Iteration counter).
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    TargetTransformInfo::TargetCostKind CostKind =
      RootI.getFunction()->hasMinSize() ?
      TargetTransformInfo::TCK_CodeSize :
      TargetTransformInfo::TCK_SizeAndLatency;
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I, CostKind);
          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                            << Iteration << "): ");
          LLVM_DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  TargetTransformInfo::TargetCostKind CostKind =
    L->getHeader()->getParent()->hasMinSize() ?
    TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two input: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      if (Iteration != 0 && SimplifiedValues.count(V))
        V = SimplifiedValues.lookup(V);
      SimplifiedInputValues.push_back({PHI, V});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it.  We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // These won't get into the final code - don't even try calculating the
        // cost for them.
        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I, CostKind);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                           (unsigned)IsFree,
                                           /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (auto *CI = dyn_cast<CallInst>(&I)) {
          const Function *Callee = CI->getCalledFunction();
          if (!Callee || TTI.isLoweredToCall(Callee)) {
            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
            return None;
          }
        }

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          LLVM_DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                            << "  UnrolledCost: " << UnrolledCost
                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                            << "\n");
          return None;
        }
      }

      Instruction *TI = BB->getTerminator();

      // Look up the terminator's operand through the simplification map so a
      // condition that folded to a constant this iteration is seen as such.
      auto getSimplifiedConstant = [&](Value *V) -> Constant * {
        if (SimplifiedValues.count(V))
          V = SimplifiedValues.lookup(V);
        return dyn_cast<Constant>(V);
      };

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
        }
      }
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      LLVM_DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                        << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  // Account for live-out values: every in-loop instruction feeding an exit
  // PHI is observable, so its cost (as of the final iteration) must count.
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
         "All instructions must have a valid cost, whether the "
         "loop is rolled or unrolled.");

  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
                    << "UnrolledCost: " << UnrolledCost << ", "
                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{unsigned(*UnrolledCost.getValue()),
           unsigned(*RolledDynamicCost.getValue())}};
}
/// ApproximateLoopSize - Approximate the size of the loop.
///
/// Runs CodeMetrics over every block of \p L and reports the aggregate
/// instruction count, clamped below by \p BEInsns + 1.
///
/// \param L               The loop to measure.
/// \param NumCalls        [out] Number of inline-candidate call sites found
///                        (Metrics.NumInlineCandidates).
/// \param NotDuplicatable [out] True if the loop contains instructions that
///                        CodeMetrics marks as not duplicatable.
/// \param Convergent      [out] True if the loop contains convergent
///                        operations.
/// \param EphValues       Ephemeral values excluded from the size estimate.
/// \param BEInsns         Estimated backedge-maintenance instruction count;
///                        used as a floor for the returned size.
unsigned llvm::ApproximateLoopSize(
    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
    const TargetTransformInfo &TTI,
    const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
  CodeMetrics Metrics;
  for (BasicBlock *BB : L->blocks())
    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
  NumCalls = Metrics.NumInlineCandidates;
  NotDuplicatable = Metrics.notDuplicatable;
  Convergent = Metrics.convergent;

  unsigned LoopSize = Metrics.NumInsts;

  // Don't allow an estimate of size zero. This would allow unrolling of loops
  // with huge iteration counts, which is a compile time problem even if it's
  // not a problem for code quality. Also, the code using this size may assume
  // that each loop has at least three instructions (likely a conditional
  // branch, a comparison feeding that branch, and some kind of loop increment
  // feeding that comparison instruction).
  LoopSize = std::max(LoopSize, BEInsns + 1);

  return LoopSize;
}
  601. // Returns the loop hint metadata node with the given name (for example,
  602. // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
  603. // returned.
  604. static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
  605. if (MDNode *LoopID = L->getLoopID())
  606. return GetUnrollMetadata(LoopID, Name);
  607. return nullptr;
  608. }
  609. // Returns true if the loop has an unroll(full) pragma.
  610. static bool hasUnrollFullPragma(const Loop *L) {
  611. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
  612. }
  613. // Returns true if the loop has an unroll(enable) pragma. This metadata is used
  614. // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
  615. static bool hasUnrollEnablePragma(const Loop *L) {
  616. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
  617. }
  618. // Returns true if the loop has an runtime unroll(disable) pragma.
  619. static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
  620. return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
  621. }
  622. // If loop has an unroll_count pragma return the (necessarily
  623. // positive) value from the pragma. Otherwise return 0.
  624. static unsigned unrollCountPragmaValue(const Loop *L) {
  625. MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
  626. if (MD) {
  627. assert(MD->getNumOperands() == 2 &&
  628. "Unroll count hint metadata should have two operands.");
  629. unsigned Count =
  630. mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
  631. assert(Count >= 1 && "Unroll count must be positive.");
  632. return Count;
  633. }
  634. return 0;
  635. }
  636. // Computes the boosting factor for complete unrolling.
  637. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
  638. // be beneficial to fully unroll the loop even if unrolledcost is large. We
  639. // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
  640. // the unroll threshold.
  641. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
  642. unsigned MaxPercentThresholdBoost) {
  643. if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
  644. return 100;
  645. else if (Cost.UnrolledCost != 0)
  646. // The boosting factor is RolledDynamicCost / UnrolledCost
  647. return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
  648. MaxPercentThresholdBoost);
  649. else
  650. return MaxPercentThresholdBoost;
  651. }
  652. // Produce an estimate of the unrolled cost of the specified loop. This
  653. // is used to a) produce a cost estimate for partial unrolling and b) to
  654. // cheaply estimate cost for full unrolling when we don't want to symbolically
  655. // evaluate all iterations.
  656. class UnrollCostEstimator {
  657. const unsigned LoopSize;
  658. public:
  659. UnrollCostEstimator(Loop &L, unsigned LoopSize) : LoopSize(LoopSize) {}
  660. // Returns loop size estimation for unrolled loop, given the unrolling
  661. // configuration specified by UP.
  662. uint64_t
  663. getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP,
  664. const unsigned CountOverwrite = 0) const {
  665. assert(LoopSize >= UP.BEInsns &&
  666. "LoopSize should not be less than BEInsns!");
  667. if (CountOverwrite)
  668. return static_cast<uint64_t>(LoopSize - UP.BEInsns) * CountOverwrite +
  669. UP.BEInsns;
  670. else
  671. return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count +
  672. UP.BEInsns;
  673. }
  674. };
// Decide on an unroll count driven purely by explicit user requests (the
// -unroll-count command-line option or loop pragmas). Returns the chosen
// count, or None when no explicit request applies and the regular
// cost-driven heuristics in computeUnrollCount() should take over.
static Optional<unsigned>
shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
                   const unsigned TripMultiple, const unsigned TripCount,
                   const UnrollCostEstimator UCE,
                   const TargetTransformInfo::UnrollingPreferences &UP) {
  // Using unroll pragma
  // 1st priority is unroll count set by "unroll-count" option.
  if (PInfo.UserUnrollCount) {
    // Only honored when a remainder loop may be emitted and the unrolled body
    // stays under the normal threshold.
    if (UP.AllowRemainder &&
        UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
      return (unsigned)UnrollCount;
  }

  // 2nd priority is unroll count set by pragma.
  if (PInfo.PragmaCount > 0) {
    // Either a remainder loop must be allowed, or the pragma count must
    // divide the known trip multiple exactly. Pragmas get the larger
    // PragmaUnrollThreshold budget.
    if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)) &&
        UCE.getUnrolledLoopSize(UP, PInfo.PragmaCount) < PragmaUnrollThreshold)
      return PInfo.PragmaCount;
  }

  // unroll(full): unroll by the exact trip count, again under the more
  // permissive pragma threshold. Requires a statically known trip count.
  if (PInfo.PragmaFullUnroll && TripCount != 0) {
    if (UCE.getUnrolledLoopSize(UP, TripCount) < PragmaUnrollThreshold)
      return TripCount;
  }
  // if didn't return until here, should continue to other priorities
  return None;
}
// Decide whether to fully unroll the loop by FullUnrollTripCount iterations.
// Returns FullUnrollTripCount when full unrolling is judged profitable,
// None otherwise.
static Optional<unsigned> shouldFullUnroll(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
    const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
    const TargetTransformInfo::UnrollingPreferences &UP) {
  assert(FullUnrollTripCount && "should be non-zero!");

  // Respect the hard cap on full-unroll trip counts.
  if (FullUnrollTripCount > UP.FullUnrollMaxCount)
    return None;

  // When computing the unrolled size, note that BEInsns are not replicated
  // like the rest of the loop body.
  if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
    return FullUnrollTripCount;

  // The loop isn't that small, but we still can fully unroll it if that
  // helps to remove a significant number of instructions.
  // To check that, run additional analysis on the loop.
  // The analysis budget is the base threshold scaled by the maximum allowed
  // boost; the boost actually applied below depends on how much dynamic cost
  // the symbolic evaluation shows unrolling would save.
  if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
          L, FullUnrollTripCount, DT, SE, EphValues, TTI,
          UP.Threshold * UP.MaxPercentThresholdBoost / 100,
          UP.MaxIterationsCountToAnalyze)) {
    unsigned Boost =
        getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
    if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
      return FullUnrollTripCount;
  }
  return None;
}
// Decide on a partial-unroll count for a loop whose trip count is statically
// known. Returns None when TripCount is unknown (partial unrolling does not
// apply); otherwise returns the chosen count, where 0 means "do not unroll".
static Optional<unsigned>
shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
                    const UnrollCostEstimator UCE,
                    const TargetTransformInfo::UnrollingPreferences &UP) {

  if (!TripCount)
    return None;

  if (!UP.Partial) {
    LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
                      << "-unroll-allow-partial not given\n");
    return 0;
  }
  // Start from any count already chosen, falling back to the full trip count.
  unsigned count = UP.Count;
  if (count == 0)
    count = TripCount;
  if (UP.PartialThreshold != NoThreshold) {
    // Reduce unroll count to be modulo of TripCount for partial unrolling.
    // First shrink the count so the unrolled size fits the partial threshold
    // (the BEInsns terms mirror getUnrolledLoopSize's accounting).
    if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
      count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
              (LoopSize - UP.BEInsns);
    if (count > UP.MaxCount)
      count = UP.MaxCount;
    // Walk down to the nearest count that divides TripCount evenly, so no
    // remainder iterations are left over.
    while (count != 0 && TripCount % count != 0)
      count--;
    if (UP.AllowRemainder && count <= 1) {
      // If there is no Count that is modulo of TripCount, set Count to
      // largest power-of-two factor that satisfies the threshold limit.
      // As we'll create fixup loop, do the type of unrolling only if
      // remainder loop is allowed.
      count = UP.DefaultUnrollRuntimeCount;
      while (count != 0 &&
             UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
        count >>= 1;
    }
    // A count below 2 means unrolling is pointless; report "don't unroll".
    if (count < 2) {
      count = 0;
    }
  } else {
    // No threshold: unroll completely by the trip count.
    count = TripCount;
  }
  if (count > UP.MaxCount)
    count = UP.MaxCount;

  LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");

  return count;
}
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
// Unless IgnoreUser is true, will also use metadata and command-line options
// that are specific to the LoopUnroll pass (which, for instance, are
// irrelevant for the LoopUnrollAndJam pass).
// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
// many LoopUnroll-specific options. The shared functionality should be
// refactored into its own function.
//
// The decision is a priority cascade: explicit peel count, explicit
// user/pragma counts, exact full unrolling, bounded (upper-bound) unrolling,
// peeling, partial unrolling, and finally runtime unrolling.
bool llvm::computeUnrollCount(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
    OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
    bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize,
    TargetTransformInfo::UnrollingPreferences &UP,
    TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {

  UnrollCostEstimator UCE(*L, LoopSize);

  // Gather all forms of explicit unroll requests: the -unroll-count option
  // and the various loop pragmas.
  const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
  const bool PragmaFullUnroll = hasUnrollFullPragma(L);
  const unsigned PragmaCount = unrollCountPragmaValue(L);
  const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);

  const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                              PragmaEnableUnroll || UserUnrollCount;

  PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
                   PragmaEnableUnroll);
  // Use an explicit peel count that has been specified for testing. In this
  // case it's not permitted to also specify an explicit unroll count.
  if (PP.PeelCount) {
    if (UnrollCount.getNumOccurrences() > 0) {
      report_fatal_error("Cannot specify both explicit peel count and "
                         "explicit unroll count");
    }
    UP.Count = 1;
    UP.Runtime = false;
    return true;
  }

  // Check for explicit Count.
  // 1st priority is unroll count set by "unroll-count" option.
  // 2nd priority is unroll count set by pragma.
  if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
                                             UCE, UP)) {
    UP.Count = *UnrollFactor;

    if (UserUnrollCount || (PragmaCount > 0)) {
      UP.AllowExpensiveTripCount = true;
      UP.Force = true;
    }
    UP.Runtime |= (PragmaCount > 0);
    return ExplicitUnroll;
  } else {
    if (ExplicitUnroll && TripCount != 0) {
      // If the loop has an unrolling pragma, we want to be more aggressive with
      // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
      // value which is larger than the default limits.
      UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
      UP.PartialThreshold =
          std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
    }
  }

  // 3rd priority is exact full unrolling. This will eliminate all copies
  // of some exit test.
  UP.Count = 0;
  if (TripCount) {
    UP.Count = TripCount;
    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                             TripCount, UCE, UP)) {
      UP.Count = *UnrollFactor;
      UseUpperBound = false;
      return ExplicitUnroll;
    }
  }

  // 4th priority is bounded unrolling.
  // We can unroll by the upper bound amount if it's generally allowed or if
  // we know that the loop is executed either the upper bound or zero times.
  // (MaxOrZero unrolling keeps only the first loop test, so the number of
  // loop tests remains the same compared to the non-unrolled version, whereas
  // the generic upper bound unrolling keeps all but the last loop test so the
  // number of loop tests goes up which may end up being worse on targets with
  // constrained branch predictor resources so is controlled by an option.)
  // In addition we only unroll small upper bounds.
  // Note that the cost of bounded unrolling is always strictly greater than
  // cost of exact full unrolling. As such, if we have an exact count and
  // found it unprofitable, we'll never chose to bounded unroll.
  if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
      MaxTripCount <= UnrollMaxUpperBound) {
    UP.Count = MaxTripCount;
    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                             MaxTripCount, UCE, UP)) {
      UP.Count = *UnrollFactor;
      UseUpperBound = true;
      return ExplicitUnroll;
    }
  }

  // 5th priority is loop peeling.
  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold);
  if (PP.PeelCount) {
    // Peeling supersedes unrolling: report an unroll count of 1.
    UP.Runtime = false;
    UP.Count = 1;
    return ExplicitUnroll;
  }

  // Before starting partial unrolling, set UP.Partial to true,
  // if user explicitly asked for unrolling
  if (TripCount)
    UP.Partial |= ExplicitUnroll;

  // 6th priority is partial unrolling.
  // Try partial unroll only when TripCount could be statically calculated.
  if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
    UP.Count = *UnrollFactor;

    // Emit a remark if a pragma asked for full unrolling but we had to settle
    // for a smaller partial count.
    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
        UP.Count != TripCount)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "FullUnrollAsDirectedTooLarge",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to fully unroll loop as directed by unroll pragma "
                  "because "
                  "unrolled size is too large.";
      });

    if (UP.PartialThreshold != NoThreshold) {
      if (UP.Count == 0) {
        // unroll(enable) was requested but even partial unrolling was
        // rejected on size grounds.
        if (PragmaEnableUnroll)
          ORE->emit([&]() {
            return OptimizationRemarkMissed(DEBUG_TYPE,
                                            "UnrollAsDirectedTooLarge",
                                            L->getStartLoc(), L->getHeader())
                   << "Unable to unroll loop as directed by unroll(enable) "
                      "pragma "
                      "because unrolled size is too large.";
          });
      }
    }
    return ExplicitUnroll;
  }
  assert(TripCount == 0 &&
         "All cases when TripCount is constant should be covered here.");
  if (PragmaFullUnroll)
    ORE->emit([&]() {
      return OptimizationRemarkMissed(
                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
                 L->getStartLoc(), L->getHeader())
             << "Unable to fully unroll loop as directed by unroll(full) "
                "pragma "
                "because loop has a runtime trip count.";
    });

  // 7th priority is runtime unrolling.
  // Don't unroll a runtime trip count loop when it is disabled.
  if (hasRuntimeUnrollDisablePragma(L)) {
    UP.Count = 0;
    return false;
  }

  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
    UP.Count = 0;
    return false;
  }

  // Check if the runtime trip count is too small when profile is available.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
      if (*ProfileTripCount < FlatLoopTripCountThreshold)
        return false;
      else
        UP.AllowExpensiveTripCount = true;
    }
  }
  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
  if (!UP.Runtime) {
    LLVM_DEBUG(
        dbgs() << " will not try to unroll loop with runtime trip count "
               << "-unroll-runtime not given\n");
    UP.Count = 0;
    return false;
  }
  if (UP.Count == 0)
    UP.Count = UP.DefaultUnrollRuntimeCount;

  // Reduce unroll count to be the largest power-of-two factor of
  // the original count which satisfies the threshold limit.
  while (UP.Count != 0 &&
         UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
    UP.Count >>= 1;

#ifndef NDEBUG
  unsigned OrigCount = UP.Count;
#endif

  // Without a remainder loop, the unroll count must divide the trip multiple;
  // shrink it (by powers of two) until it does.
  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
      UP.Count >>= 1;
    LLVM_DEBUG(
        dbgs() << "Remainder loop is restricted (that could architecture "
                  "specific or because the loop contains a convergent "
                  "instruction), so unroll count must divide the trip "
                  "multiple, "
               << TripMultiple << ". Reducing unroll count from " << OrigCount
               << " to " << UP.Count << ".\n");

    using namespace ore;

    if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "DifferentUnrollCountFromDirected",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to unroll loop the number of times directed by "
                  "unroll_count pragma because remainder loop is restricted "
                  "(that could architecture specific or because the loop "
                  "contains a convergent instruction) and so must have an "
                  "unroll "
                  "count that divides the loop trip multiple of "
               << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
               << NV("UnrollCount", UP.Count) << " time(s).";
      });
  }

  if (UP.Count > UP.MaxCount)
    UP.Count = UP.MaxCount;

  if (MaxTripCount && UP.Count > MaxTripCount)
    UP.Count = MaxTripCount;

  LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
                    << "\n");
  // Counts below 2 are not worth unrolling.
  if (UP.Count < 2)
    UP.Count = 0;
  return ExplicitUnroll;
}
// Driver for unrolling a single loop: gathers preferences, sizes the loop,
// computes trip count information, asks computeUnrollCount() for a count,
// then performs peeling or unrolling and fixes up loop metadata afterwards.
// Returns the LoopUnrollResult reported by the transformation.
static LoopUnrollResult tryToUnrollLoop(
    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
    const TargetTransformInfo &TTI, AssumptionCache &AC,
    OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
    bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
    Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
    Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
    Optional<bool> ProvidedAllowPeeling,
    Optional<bool> ProvidedAllowProfileBasedPeeling,
    Optional<unsigned> ProvidedFullUnrollMaxCount) {

  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                    << L->getHeader()->getParent()->getName() << "] Loop %"
                    << L->getHeader()->getName() << "\n");
  TransformationMode TM = hasUnrollTransformation(L);
  if (TM & TM_Disable)
    return LoopUnrollResult::Unmodified;

  // If this loop isn't forced to be unrolled, avoid unrolling it when the
  // parent loop has an explicit unroll-and-jam pragma. This is to prevent
  // automatic unrolling from interfering with the user requested
  // transformation.
  Loop *ParentL = L->getParentLoop();
  if (ParentL != nullptr &&
      hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
      hasUnrollTransformation(L) != TM_ForcedByUser) {
    LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
                      << " llvm.loop.unroll_and_jam.\n");
    return LoopUnrollResult::Unmodified;
  }

  // If this loop isn't forced to be unrolled, avoid unrolling it when the
  // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
  // unrolling from interfering with the user requested transformation.
  if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
      hasUnrollTransformation(L) != TM_ForcedByUser) {
    LLVM_DEBUG(
        dbgs()
        << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
    return LoopUnrollResult::Unmodified;
  }

  if (!L->isLoopSimplifyForm()) {
    LLVM_DEBUG(
        dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When automatic unrolling is disabled, do not unroll unless overridden for
  // this loop.
  if (OnlyWhenForced && !(TM & TM_Enable))
    return LoopUnrollResult::Unmodified;

  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
      ProvidedFullUnrollMaxCount);
  TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
      L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);

  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
  // as threshold later on.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
      !OptForSize)
    return LoopUnrollResult::Unmodified;

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  unsigned LoopSize =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
  if (NotDuplicatable) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
                      << " instructions.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
  // later), to (fully) unroll loops, if it does not increase code size.
  if (OptForSize)
    UP.Threshold = std::max(UP.Threshold, LoopSize + 1);

  if (NumInlineCandidates != 0) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Find the smallest exact trip count for any exit. This is an upper bound
  // on the loop trip count, but an exit at an earlier iteration is still
  // possible. An unroll by the smallest exact trip count guarantees that all
  // branches relating to at least one exit can be eliminated. This is unlike
  // the max trip count, which only guarantees that the backedge can be broken.
  unsigned TripCount = 0;
  unsigned TripMultiple = 1;
  SmallVector<BasicBlock *, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  for (BasicBlock *ExitingBlock : ExitingBlocks)
    if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
      if (!TripCount || TC < TripCount)
        TripCount = TripMultiple = TC;

  if (!TripCount) {
    // If no exact trip count is known, determine the trip multiple of either
    // the loop latch or the single exiting block.
    // TODO: Relax for multiple exits.
    BasicBlock *ExitingBlock = L->getLoopLatch();
    if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
      ExitingBlock = L->getExitingBlock();
    if (ExitingBlock)
      TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
  }

  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation. Therefore restrict remainder loop (try unrolling without).
  //
  // TODO: This is quite conservative. In practice, convergent_op()
  // is likely to be called unconditionally in the loop. In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe. But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;

  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  unsigned MaxTripCount = 0;
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
  }

  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly = computeUnrollCount(
      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
      TripMultiple, LoopSize, UP, PP, UseUpperBound);
  if (!UP.Count)
    return LoopUnrollResult::Unmodified;

  if (PP.PeelCount) {
    assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
    LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
                      << " with iteration count " << PP.PeelCount << "!\n");
    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
                                L->getHeader())
             << " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
             << " iterations";
    });

    if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA)) {
      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
      // If the loop was peeled, we already "used up" the profile information
      // we had, so we don't want to unroll or peel again.
      if (PP.PeelProfiledIterations)
        L->setLoopAlreadyUnrolled();
      return LoopUnrollResult::PartiallyUnrolled;
    }
    return LoopUnrollResult::Unmodified;
  }

  // At this point, UP.Runtime indicates that run-time unrolling is allowed.
  // However, we only want to actually perform it if we don't know the trip
  // count and the unroll count doesn't divide the known trip multiple.
  // TODO: This decision should probably be pushed up into
  // computeUnrollCount().
  UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;

  // Save loop properties before it is transformed.
  MDNode *OrigLoopID = L->getLoopID();

  // Unroll the loop.
  Loop *RemainderLoop = nullptr;
  LoopUnrollResult UnrollResult = UnrollLoop(
      L,
      {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
       UP.UnrollRemainder, ForgetAllSCEV},
      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
  if (UnrollResult == LoopUnrollResult::Unmodified)
    return LoopUnrollResult::Unmodified;

  // Propagate followup metadata to the remainder loop, if one was created.
  if (RemainderLoop) {
    Optional<MDNode *> RemainderLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupRemainder});
    if (RemainderLoopID.hasValue())
      RemainderLoop->setLoopID(RemainderLoopID.getValue());
  }

  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
    Optional<MDNode *> NewLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupUnrolled});
    if (NewLoopID.hasValue()) {
      L->setLoopID(NewLoopID.getValue());

      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
      // explicitly.
      return UnrollResult;
    }
  }

  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
    L->setLoopAlreadyUnrolled();

  return UnrollResult;
}
  1182. namespace {
  1183. class LoopUnroll : public LoopPass {
  1184. public:
  1185. static char ID; // Pass ID, replacement for typeid
  1186. int OptLevel;
  1187. /// If false, use a cost model to determine whether unrolling of a loop is
  1188. /// profitable. If true, only loops that explicitly request unrolling via
  1189. /// metadata are considered. All other loops are skipped.
  1190. bool OnlyWhenForced;
  1191. /// If false, when SCEV is invalidated, only forget everything in the
  1192. /// top-most loop (call forgetTopMostLoop), of the loop being processed.
  1193. /// Otherwise, forgetAllLoops and rebuild when needed next.
  1194. bool ForgetAllSCEV;
  1195. Optional<unsigned> ProvidedCount;
  1196. Optional<unsigned> ProvidedThreshold;
  1197. Optional<bool> ProvidedAllowPartial;
  1198. Optional<bool> ProvidedRuntime;
  1199. Optional<bool> ProvidedUpperBound;
  1200. Optional<bool> ProvidedAllowPeeling;
  1201. Optional<bool> ProvidedAllowProfileBasedPeeling;
  1202. Optional<unsigned> ProvidedFullUnrollMaxCount;
  1203. LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
  1204. bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
  1205. Optional<unsigned> Count = None,
  1206. Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
  1207. Optional<bool> UpperBound = None,
  1208. Optional<bool> AllowPeeling = None,
  1209. Optional<bool> AllowProfileBasedPeeling = None,
  1210. Optional<unsigned> ProvidedFullUnrollMaxCount = None)
  1211. : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
  1212. ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
  1213. ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
  1214. ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
  1215. ProvidedAllowPeeling(AllowPeeling),
  1216. ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
  1217. ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
  1218. initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
  1219. }
  1220. bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  1221. if (skipLoop(L))
  1222. return false;
  1223. Function &F = *L->getHeader()->getParent();
  1224. auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1225. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1226. ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1227. const TargetTransformInfo &TTI =
  1228. getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1229. auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  1230. // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
  1231. // pass. Function analyses need to be preserved across loop transformations
  1232. // but ORE cannot be preserved (see comment before the pass definition).
  1233. OptimizationRemarkEmitter ORE(&F);
  1234. bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  1235. LoopUnrollResult Result = tryToUnrollLoop(
  1236. L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
  1237. OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
  1238. ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
  1239. ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
  1240. ProvidedFullUnrollMaxCount);
  1241. if (Result == LoopUnrollResult::FullyUnrolled)
  1242. LPM.markLoopAsDeleted(*L);
  1243. return Result != LoopUnrollResult::Unmodified;
  1244. }
  1245. /// This transformation requires natural loop information & requires that
  1246. /// loop preheaders be inserted into the CFG...
  1247. void getAnalysisUsage(AnalysisUsage &AU) const override {
  1248. AU.addRequired<AssumptionCacheTracker>();
  1249. AU.addRequired<TargetTransformInfoWrapperPass>();
  1250. // FIXME: Loop passes are required to preserve domtree, and for now we just
  1251. // recreate dom info if anything gets unrolled.
  1252. getLoopAnalysisUsage(AU);
  1253. }
  1254. };
  1255. } // end anonymous namespace
char LoopUnroll::ID = 0;

// Register the legacy pass with the PassRegistry, declaring the analysis
// passes it transitively depends on so they are initialized first.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
  1262. Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
  1263. bool ForgetAllSCEV, int Threshold, int Count,
  1264. int AllowPartial, int Runtime, int UpperBound,
  1265. int AllowPeeling) {
  1266. // TODO: It would make more sense for this function to take the optionals
  1267. // directly, but that's dangerous since it would silently break out of tree
  1268. // callers.
  1269. return new LoopUnroll(
  1270. OptLevel, OnlyWhenForced, ForgetAllSCEV,
  1271. Threshold == -1 ? None : Optional<unsigned>(Threshold),
  1272. Count == -1 ? None : Optional<unsigned>(Count),
  1273. AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
  1274. Runtime == -1 ? None : Optional<bool>(Runtime),
  1275. UpperBound == -1 ? None : Optional<bool>(UpperBound),
  1276. AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
  1277. }
// Factory for the "simple" unroller: default (cost-model) threshold and
// count, partial/runtime/upper-bound unrolling disabled, peeling enabled.
// -1 means "not provided" for the int-sentinel parameters.
Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
                                       bool ForgetAllSCEV) {
  return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV,
                              /*Threshold*/ -1, /*Count*/ -1,
                              /*AllowPartial*/ 0, /*Runtime*/ 0,
                              /*UpperBound*/ 0, /*AllowPeeling*/ 1);
}
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

  // Cache the name now: if the loop is fully unrolled, L is destroyed and
  // only the name can be reported to the updater afterwards.
  std::string LoopName = std::string(L.getName());

  // Full unrolling only: partial/runtime/upper-bound unrolling are disabled
  // and no explicit count or threshold is forced.
  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
                                 /*BFI*/ nullptr, /*PSI*/ nullptr,
                                 /*PreserveLCSSA*/ true, OptLevel,
                                 OnlyWhenForced, ForgetSCEV, /*Count*/ None,
                                 /*Threshold*/ None, /*AllowPartial*/ false,
                                 /*Runtime*/ false, /*UpperBound*/ false,
                                 /*AllowPeeling*/ true,
                                 /*AllowProfileBasedPeeling*/ false,
                                 /*FullUnrollMaxCount*/ None) !=
                 LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Keep only the loops that were NOT present before unrolling; seeing L
  // itself in the current sibling list proves it survived.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.contains(SibLoop);
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}
PreservedAnalyses LoopUnrollPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // A loop analysis manager is only available if the loop-pass pipeline has
  // already run over this function; use it (when present) to drop stale
  // per-loop results for loops we delete.
  LoopAnalysisManager *LAM = nullptr;
  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
    LAM = &LAMProxy->getManager();

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  // Block frequencies are only worth computing when a profile summary exists.
  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
      &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;

  bool Changed = false;

  // The unroller requires loops to be in simplified form, and also needs LCSSA.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop unroller
  // will simplify all loops, regardless of whether anything end up being
  // unrolled.
  for (auto &L : LI) {
    Changed |=
        simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  // Add the loop nests in the reverse order of LoopInfo. See method
  // declaration.
  SmallPriorityWorklist<Loop *, 4> Worklist;
  appendLoopsToWorklist(LI, Worklist);

  while (!Worklist.empty()) {
    // Because the LoopInfo stores the loops in RPO, we walk the worklist
    // from back to front so that we work forward across the CFG, which
    // for unrolling is only needed to get optimization remarks emitted in
    // a forward order.
    Loop &L = *Worklist.pop_back_val();
#ifndef NDEBUG
    Loop *ParentL = L.getParentLoop();
#endif

    // Check if the profile summary indicates that the profiled application
    // has a huge working set size, in which case we disable peeling to avoid
    // bloating it further.
    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
    if (PSI && PSI->hasHugeWorkingSetSize())
      LocalAllowPeeling = false;
    // Cache the name before unrolling: a fully unrolled loop is destroyed and
    // only the name remains usable for clearing cached analyses.
    std::string LoopName = std::string(L.getName());
    // The API here is quite complex to call and we allow to select some
    // flavors of unrolling during construction time (by setting UnrollOpts).
    LoopUnrollResult Result = tryToUnrollLoop(
        &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
        UnrollOpts.ForgetSCEV, /*Count*/ None,
        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
        UnrollOpts.AllowUpperBound, LocalAllowPeeling,
        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
    Changed |= Result != LoopUnrollResult::Unmodified;

    // The parent must not be damaged by unrolling!
#ifndef NDEBUG
    if (Result != LoopUnrollResult::Unmodified && ParentL)
      ParentL->verifyLoop();
#endif

    // Clear any cached analysis results for L if we removed it completely.
    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
      LAM->clear(L, LoopName);
  }

  if (!Changed)
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}
  1429. void LoopUnrollPass::printPipeline(
  1430. raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  1431. static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
  1432. OS, MapClassName2PassName);
  1433. OS << "<";
  1434. if (UnrollOpts.AllowPartial != None)
  1435. OS << (UnrollOpts.AllowPartial.getValue() ? "" : "no-") << "partial;";
  1436. if (UnrollOpts.AllowPeeling != None)
  1437. OS << (UnrollOpts.AllowPeeling.getValue() ? "" : "no-") << "peeling;";
  1438. if (UnrollOpts.AllowRuntime != None)
  1439. OS << (UnrollOpts.AllowRuntime.getValue() ? "" : "no-") << "runtime;";
  1440. if (UnrollOpts.AllowUpperBound != None)
  1441. OS << (UnrollOpts.AllowUpperBound.getValue() ? "" : "no-") << "upperbound;";
  1442. if (UnrollOpts.AllowProfileBasedPeeling != None)
  1443. OS << (UnrollOpts.AllowProfileBasedPeeling.getValue() ? "" : "no-")
  1444. << "profile-peeling;";
  1445. if (UnrollOpts.FullUnrollMaxCount != None)
  1446. OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";";
  1447. OS << "O" << UnrollOpts.OptLevel;
  1448. OS << ">";
  1449. }