  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // Shared implementation of BlockFrequency for IR and Machine Instructions.
  15. // See the documentation below for BlockFrequencyInfoImpl for details.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  19. #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  20. #include "llvm/ADT/BitVector.h"
  21. #include "llvm/ADT/DenseMap.h"
  22. #include "llvm/ADT/DenseSet.h"
  23. #include "llvm/ADT/GraphTraits.h"
  24. #include "llvm/ADT/PostOrderIterator.h"
  25. #include "llvm/ADT/SmallPtrSet.h"
  26. #include "llvm/ADT/SmallVector.h"
  27. #include "llvm/ADT/SparseBitVector.h"
  28. #include "llvm/ADT/Twine.h"
  29. #include "llvm/ADT/iterator_range.h"
  30. #include "llvm/IR/BasicBlock.h"
  31. #include "llvm/IR/ValueHandle.h"
  32. #include "llvm/Support/BlockFrequency.h"
  33. #include "llvm/Support/BranchProbability.h"
  34. #include "llvm/Support/CommandLine.h"
  35. #include "llvm/Support/DOTGraphTraits.h"
  36. #include "llvm/Support/Debug.h"
  37. #include "llvm/Support/Format.h"
  38. #include "llvm/Support/ScaledNumber.h"
  39. #include "llvm/Support/raw_ostream.h"
  40. #include <algorithm>
  41. #include <cassert>
  42. #include <cstddef>
  43. #include <cstdint>
  44. #include <deque>
  45. #include <iterator>
  46. #include <limits>
  47. #include <list>
  48. #include <optional>
  49. #include <queue>
  50. #include <string>
  51. #include <utility>
  52. #include <vector>
  53. #define DEBUG_TYPE "block-freq"
  54. namespace llvm {
  55. extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;
  56. extern llvm::cl::opt<bool> UseIterativeBFIInference;
  57. extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock;
  58. extern llvm::cl::opt<double> IterativeBFIPrecision;
  59. class BranchProbabilityInfo;
  60. class Function;
  61. class Loop;
  62. class LoopInfo;
  63. class MachineBasicBlock;
  64. class MachineBranchProbabilityInfo;
  65. class MachineFunction;
  66. class MachineLoop;
  67. class MachineLoopInfo;
  68. namespace bfi_detail {
  69. struct IrreducibleGraph;
  70. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  71. template <class BT> struct BlockEdgesAdder;
  72. /// Mass of a block.
  73. ///
  74. /// This class implements a sort of fixed-point fraction always between 0.0 and
  75. /// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
  76. /// 1.0.
  77. ///
  78. /// Masses can be added and subtracted. Simple saturation arithmetic is used,
  79. /// so arithmetic operations never overflow or underflow.
  80. ///
  81. /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
  82. /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
  83. /// quite, maximum precision).
  84. ///
  85. /// Masses can be scaled by \a BranchProbability at maximum precision.
  86. class BlockMass {
  87. uint64_t Mass = 0;
  88. public:
  89. BlockMass() = default;
  90. explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
  91. static BlockMass getEmpty() { return BlockMass(); }
  92. static BlockMass getFull() {
  93. return BlockMass(std::numeric_limits<uint64_t>::max());
  94. }
  95. uint64_t getMass() const { return Mass; }
  96. bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); }
  97. bool isEmpty() const { return !Mass; }
  98. bool operator!() const { return isEmpty(); }
  99. /// Add another mass.
  100. ///
  101. /// Adds another mass, saturating at \a isFull() rather than overflowing.
  102. BlockMass &operator+=(BlockMass X) {
  103. uint64_t Sum = Mass + X.Mass;
  104. Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum;
  105. return *this;
  106. }
  107. /// Subtract another mass.
  108. ///
  109. /// Subtracts another mass, saturating at \a isEmpty() rather than
  110. /// underflowing.
  111. BlockMass &operator-=(BlockMass X) {
  112. uint64_t Diff = Mass - X.Mass;
  113. Mass = Diff > Mass ? 0 : Diff;
  114. return *this;
  115. }
  116. BlockMass &operator*=(BranchProbability P) {
  117. Mass = P.scale(Mass);
  118. return *this;
  119. }
  120. bool operator==(BlockMass X) const { return Mass == X.Mass; }
  121. bool operator!=(BlockMass X) const { return Mass != X.Mass; }
  122. bool operator<=(BlockMass X) const { return Mass <= X.Mass; }
  123. bool operator>=(BlockMass X) const { return Mass >= X.Mass; }
  124. bool operator<(BlockMass X) const { return Mass < X.Mass; }
  125. bool operator>(BlockMass X) const { return Mass > X.Mass; }
  126. /// Convert to scaled number.
  127. ///
  128. /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
  129. /// gives slightly above 0.0.
  130. ScaledNumber<uint64_t> toScaled() const;
  131. void dump() const;
  132. raw_ostream &print(raw_ostream &OS) const;
  133. };
  134. inline BlockMass operator+(BlockMass L, BlockMass R) {
  135. return BlockMass(L) += R;
  136. }
  137. inline BlockMass operator-(BlockMass L, BlockMass R) {
  138. return BlockMass(L) -= R;
  139. }
  140. inline BlockMass operator*(BlockMass L, BranchProbability R) {
  141. return BlockMass(L) *= R;
  142. }
  143. inline BlockMass operator*(BranchProbability L, BlockMass R) {
  144. return BlockMass(R) *= L;
  145. }
  146. inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) {
  147. return X.print(OS);
  148. }
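// Illustrative sketch (not part of the upstream header): how the saturating
// arithmetic and BranchProbability scaling described above behave. The
// variable names below are invented for the example.
//
//   BlockMass Full = BlockMass::getFull();            // represents ~1.0
//   BlockMass Half = Full * BranchProbability(1, 2);  // represents ~0.5
//   assert((Full + Half).isFull());   // addition saturates at full mass
//   assert((Half - Full).isEmpty());  // subtraction saturates at empty mass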
  149. } // end namespace bfi_detail
  150. /// Base class for BlockFrequencyInfoImpl
  151. ///
  152. /// BlockFrequencyInfoImplBase has supporting data structures and some
  153. /// algorithms for BlockFrequencyInfoImpl. Only algorithms that depend on
  154. /// the block type (or that call such algorithms) are skipped here.
  155. ///
  156. /// Nevertheless, the majority of the overall algorithm documentation lives with
  157. /// BlockFrequencyInfoImpl. See there for details.
  158. class BlockFrequencyInfoImplBase {
  159. public:
  160. using Scaled64 = ScaledNumber<uint64_t>;
  161. using BlockMass = bfi_detail::BlockMass;
  162. /// Representative of a block.
  163. ///
  164. /// This is a simple wrapper around an index into the reverse-post-order
  165. /// traversal of the blocks.
  166. ///
  167. /// Unlike a block pointer, its order has meaning (location in the
  168. /// topological sort) and its class is the same regardless of block type.
  169. struct BlockNode {
  170. using IndexType = uint32_t;
  171. IndexType Index;
  172. BlockNode() : Index(std::numeric_limits<uint32_t>::max()) {}
  173. BlockNode(IndexType Index) : Index(Index) {}
  174. bool operator==(const BlockNode &X) const { return Index == X.Index; }
  175. bool operator!=(const BlockNode &X) const { return Index != X.Index; }
  176. bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
  177. bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
  178. bool operator<(const BlockNode &X) const { return Index < X.Index; }
  179. bool operator>(const BlockNode &X) const { return Index > X.Index; }
  180. bool isValid() const { return Index <= getMaxIndex(); }
  181. static size_t getMaxIndex() {
  182. return std::numeric_limits<uint32_t>::max() - 1;
  183. }
  184. };
  185. /// Stats about a block itself.
  186. struct FrequencyData {
  187. Scaled64 Scaled;
  188. uint64_t Integer;
  189. };
  190. /// Data about a loop.
  191. ///
  192. /// Contains the data necessary to represent a loop as a pseudo-node once it's
  193. /// packaged.
  194. struct LoopData {
  195. using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>;
  196. using NodeList = SmallVector<BlockNode, 4>;
  197. using HeaderMassList = SmallVector<BlockMass, 1>;
  198. LoopData *Parent; ///< The parent loop.
  199. bool IsPackaged = false; ///< Whether this has been packaged.
  200. uint32_t NumHeaders = 1; ///< Number of headers.
  201. ExitMap Exits; ///< Successor edges (and weights).
  202. NodeList Nodes; ///< Header and the members of the loop.
  203. HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
  204. BlockMass Mass;
  205. Scaled64 Scale;
  206. LoopData(LoopData *Parent, const BlockNode &Header)
  207. : Parent(Parent), Nodes(1, Header), BackedgeMass(1) {}
  208. template <class It1, class It2>
  209. LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
  210. It2 LastOther)
  211. : Parent(Parent), Nodes(FirstHeader, LastHeader) {
  212. NumHeaders = Nodes.size();
  213. Nodes.insert(Nodes.end(), FirstOther, LastOther);
  214. BackedgeMass.resize(NumHeaders);
  215. }
  216. bool isHeader(const BlockNode &Node) const {
  217. if (isIrreducible())
  218. return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
  219. Node);
  220. return Node == Nodes[0];
  221. }
  222. BlockNode getHeader() const { return Nodes[0]; }
  223. bool isIrreducible() const { return NumHeaders > 1; }
  224. HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
  225. assert(isHeader(B) && "this is only valid on loop header blocks");
  226. if (isIrreducible())
  227. return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
  228. Nodes.begin();
  229. return 0;
  230. }
  231. NodeList::const_iterator members_begin() const {
  232. return Nodes.begin() + NumHeaders;
  233. }
  234. NodeList::const_iterator members_end() const { return Nodes.end(); }
  235. iterator_range<NodeList::const_iterator> members() const {
  236. return make_range(members_begin(), members_end());
  237. }
  238. };
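// Illustrative sketch (hypothetical nodes): for a reducible loop there is a
// single header, Nodes[0]; for an irreducible SCC the first NumHeaders
// entries of Nodes are the sorted headers.
//
//   LoopData LD(/*Parent=*/nullptr, BlockNode(2));  // loop headed by node 2
//   assert(!LD.isIrreducible() && LD.isHeader(BlockNode(2)));
//   assert(LD.getHeaderIndex(BlockNode(2)) == 0);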
  239. /// Working data for a block: its node, its containing loop, and its mass.
  240. struct WorkingData {
  241. BlockNode Node; ///< This node.
  242. LoopData *Loop = nullptr; ///< The loop this block is inside.
  243. BlockMass Mass; ///< Mass distribution from the entry block.
  244. WorkingData(const BlockNode &Node) : Node(Node) {}
  245. bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
  246. bool isDoubleLoopHeader() const {
  247. return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
  248. Loop->Parent->isHeader(Node);
  249. }
  250. LoopData *getContainingLoop() const {
  251. if (!isLoopHeader())
  252. return Loop;
  253. if (!isDoubleLoopHeader())
  254. return Loop->Parent;
  255. return Loop->Parent->Parent;
  256. }
  257. /// Resolve a node to its representative.
  258. ///
  259. /// Get the node currently representing Node, which could be a containing
  260. /// loop.
  261. ///
  262. /// This function should only be called when distributing mass. As long as
  263. /// there are no irreducible edges to Node, then it will have complexity
  264. /// O(1) in this context.
  265. ///
  266. /// In general, the complexity is O(L), where L is the number of loop
  267. /// headers Node has been packaged into. Since this method is called in
  268. /// the context of distributing mass, L will be the number of loop headers
  269. /// an early exit edge jumps out of.
  270. BlockNode getResolvedNode() const {
  271. auto L = getPackagedLoop();
  272. return L ? L->getHeader() : Node;
  273. }
  274. LoopData *getPackagedLoop() const {
  275. if (!Loop || !Loop->IsPackaged)
  276. return nullptr;
  277. auto L = Loop;
  278. while (L->Parent && L->Parent->IsPackaged)
  279. L = L->Parent;
  280. return L;
  281. }
  282. /// Get the appropriate mass for a node.
  283. ///
  284. /// Get appropriate mass for Node. If Node is a loop-header (whose loop
  285. /// has been packaged), returns the mass of its pseudo-node. If it's a
  286. /// node inside a packaged loop, it returns the loop's mass.
  287. BlockMass &getMass() {
  288. if (!isAPackage())
  289. return Mass;
  290. if (!isADoublePackage())
  291. return Loop->Mass;
  292. return Loop->Parent->Mass;
  293. }
  294. /// Has ContainingLoop been packaged up?
  295. bool isPackaged() const { return getResolvedNode() != Node; }
  296. /// Has Loop been packaged up?
  297. bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
  298. /// Has Loop been packaged up twice?
  299. bool isADoublePackage() const {
  300. return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
  301. }
  302. };
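// Illustrative sketch (hypothetical setup, using the Working vector declared
// below): once the loop containing node N has been packaged, queries against
// N resolve to the pseudo-node of the outermost packaged loop, and getMass()
// returns that pseudo-node's mass.
//
//   WorkingData &WD = Working[N.Index];
//   if (WD.isPackaged()) {
//     BlockNode Rep = WD.getResolvedNode();        // header of packaged loop
//     BlockMass &M = Working[Rep.Index].getMass(); // pseudo-node's mass
//   }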
  303. /// Unscaled probability weight.
  304. ///
  305. /// Probability weight for an edge in the graph (including the
  306. /// successor/target node).
  307. ///
  308. /// All edges in the original function are 32-bit. However, exit edges from
  309. /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
  310. /// space in general.
  311. ///
  312. /// In addition to the raw weight amount, Weight stores the type of the edge
  313. /// in the current context (i.e., the context of the loop being processed).
  314. /// Is this a local edge within the loop, an exit from the loop, or a
  315. /// backedge to the loop header?
  316. struct Weight {
  317. enum DistType { Local, Exit, Backedge };
  318. DistType Type = Local;
  319. BlockNode TargetNode;
  320. uint64_t Amount = 0;
  321. Weight() = default;
  322. Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
  323. : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
  324. };
  325. /// Distribution of unscaled probability weight.
  326. ///
  327. /// Distribution of unscaled probability weight to a set of successors.
  328. ///
  329. /// This class collates the successor edge weights for later processing.
  330. ///
  331. /// \a DidOverflow indicates whether \a Total did overflow while adding to
  332. /// the distribution. It should never overflow twice.
  333. struct Distribution {
  334. using WeightList = SmallVector<Weight, 4>;
  335. WeightList Weights; ///< Individual successor weights.
  336. uint64_t Total = 0; ///< Sum of all weights.
  337. bool DidOverflow = false; ///< Whether \a Total did overflow.
  338. Distribution() = default;
  339. void addLocal(const BlockNode &Node, uint64_t Amount) {
  340. add(Node, Amount, Weight::Local);
  341. }
  342. void addExit(const BlockNode &Node, uint64_t Amount) {
  343. add(Node, Amount, Weight::Exit);
  344. }
  345. void addBackedge(const BlockNode &Node, uint64_t Amount) {
  346. add(Node, Amount, Weight::Backedge);
  347. }
  348. /// Normalize the distribution.
  349. ///
  350. /// Combines multiple edges to the same \a Weight::TargetNode and scales
  351. /// down so that \a Total fits into 32-bits.
  352. ///
  353. /// This is linear in the size of \a Weights. For the vast majority of
  354. /// cases, adjacent edge weights are combined by sorting WeightList and
  355. /// combining adjacent weights. However, for very large edge lists an
  356. /// auxiliary hash table is used.
  357. void normalize();
  358. private:
  359. void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  360. };
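// Illustrative sketch (hypothetical node indices): collate the successor
// weights of one block, then normalize before distributing its mass.
//
//   Distribution Dist;
//   Dist.addLocal(BlockNode(3), 100);    // edge staying inside the loop
//   Dist.addExit(BlockNode(7), 25);      // edge leaving the loop
//   Dist.addBackedge(BlockNode(1), 50);  // edge back to the loop header
//   Dist.normalize();  // coalesce duplicate targets; make Total fit 32 bits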
  361. /// Data about each block. This is used downstream.
  362. std::vector<FrequencyData> Freqs;
  363. /// Whether each block is an irreducible loop header.
  364. /// This is used downstream.
  365. SparseBitVector<> IsIrrLoopHeader;
  366. /// Loop data: see initializeLoops().
  367. std::vector<WorkingData> Working;
  368. /// Indexed information about loops.
  369. std::list<LoopData> Loops;
  370. /// Virtual destructor.
  371. ///
  372. /// Need a virtual destructor to mask the compiler warning about
  373. /// getBlockName().
  374. virtual ~BlockFrequencyInfoImplBase() = default;
  375. /// Add all edges out of a packaged loop to the distribution.
  376. ///
  377. /// Adds all edges out of \c Loop to \c Dist. Calls addToDist() to add each
  378. /// successor edge.
  379. ///
  380. /// \return \c true unless there's an irreducible backedge.
  381. bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
  382. Distribution &Dist);
  383. /// Add an edge to the distribution.
  384. ///
  385. /// Adds the edge from \c Pred to \c Succ to \c Dist. If \c OuterLoop is
  386. /// non-null, the edge's local/exit/backedge category is determined in the context of \c OuterLoop. Otherwise,
  387. /// every edge should be a local edge (since all the loops are packaged up).
  388. ///
  389. /// \return \c true unless aborted due to an irreducible backedge.
  390. bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
  391. const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
  392. /// Analyze irreducible SCCs.
  393. ///
  394. /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
  395. /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  396. /// Insert them into \a Loops before \c Insert.
  397. ///
  398. /// \return the \c LoopData nodes representing the irreducible SCCs.
  399. iterator_range<std::list<LoopData>::iterator>
  400. analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
  401. std::list<LoopData>::iterator Insert);
  402. /// Update a loop after packaging irreducible SCCs inside of it.
  403. ///
  404. /// Update \c OuterLoop. Before finding irreducible control flow, it was
  405. /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  406. /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
  407. /// up need to be removed from \a OuterLoop::Nodes.
  408. void updateLoopWithIrreducible(LoopData &OuterLoop);
  409. /// Distribute mass according to a distribution.
  410. ///
  411. /// Distributes the mass in \c Source according to \c Dist. If \c OuterLoop is
  412. /// non-null, backedge and exit masses are stored in its entry in \a Loops.
  413. ///
  414. /// Mass is distributed in parallel from two copies of the source mass.
  415. void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
  416. Distribution &Dist);
  417. /// Compute the loop scale for a loop.
  418. void computeLoopScale(LoopData &Loop);
  419. /// Adjust the mass of all headers in an irreducible loop.
  420. ///
  421. /// Initially, irreducible loops are assumed to distribute their mass
  422. /// equally among their headers. This can lead to wrong frequency estimates
  423. /// since some headers may be executed more frequently than others.
  424. ///
  425. /// This adjusts header mass distribution so it matches the weights of
  426. /// the backedges going into each of the loop headers.
  427. void adjustLoopHeaderMass(LoopData &Loop);
  428. void distributeIrrLoopHeaderMass(Distribution &Dist);
  429. /// Package up a loop.
  430. void packageLoop(LoopData &Loop);
  431. /// Unwrap loops.
  432. void unwrapLoops();
  433. /// Finalize frequency metrics.
  434. ///
  435. /// Calculates final frequencies and cleans up no-longer-needed data
  436. /// structures.
  437. void finalizeMetrics();
  438. /// Clear all memory.
  439. void clear();
  440. virtual std::string getBlockName(const BlockNode &Node) const;
  441. std::string getLoopName(const LoopData &Loop) const;
  442. virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  443. void dump() const { print(dbgs()); }
  444. Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
  445. BlockFrequency getBlockFreq(const BlockNode &Node) const;
  446. std::optional<uint64_t>
  447. getBlockProfileCount(const Function &F, const BlockNode &Node,
  448. bool AllowSynthetic = false) const;
  449. std::optional<uint64_t>
  450. getProfileCountFromFreq(const Function &F, uint64_t Freq,
  451. bool AllowSynthetic = false) const;
  452. bool isIrrLoopHeader(const BlockNode &Node);
  453. void setBlockFreq(const BlockNode &Node, uint64_t Freq);
  454. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  455. raw_ostream &printBlockFreq(raw_ostream &OS,
  456. const BlockFrequency &Freq) const;
  457. uint64_t getEntryFreq() const {
  458. assert(!Freqs.empty());
  459. return Freqs[0].Integer;
  460. }
  461. };
  462. namespace bfi_detail {
  463. template <class BlockT> struct TypeMap {};
  464. template <> struct TypeMap<BasicBlock> {
  465. using BlockT = BasicBlock;
  466. using BlockKeyT = AssertingVH<const BasicBlock>;
  467. using FunctionT = Function;
  468. using BranchProbabilityInfoT = BranchProbabilityInfo;
  469. using LoopT = Loop;
  470. using LoopInfoT = LoopInfo;
  471. };
  472. template <> struct TypeMap<MachineBasicBlock> {
  473. using BlockT = MachineBasicBlock;
  474. using BlockKeyT = const MachineBasicBlock *;
  475. using FunctionT = MachineFunction;
  476. using BranchProbabilityInfoT = MachineBranchProbabilityInfo;
  477. using LoopT = MachineLoop;
  478. using LoopInfoT = MachineLoopInfo;
  479. };
  480. template <class BlockT, class BFIImplT>
  481. class BFICallbackVH;
  482. /// Get the name of a MachineBasicBlock.
  483. ///
  484. /// Get the name of a MachineBasicBlock. It's templated so that including from
  485. /// CodeGen is unnecessary (that would be a layering issue).
  486. ///
  487. /// This is used mainly for debug output. The name is similar to
  488. /// MachineBasicBlock::getFullName(), but skips the name of the function.
  489. template <class BlockT> std::string getBlockName(const BlockT *BB) {
  490. assert(BB && "Unexpected nullptr");
  491. auto MachineName = "BB" + Twine(BB->getNumber());
  492. if (BB->getBasicBlock())
  493. return (MachineName + "[" + BB->getName() + "]").str();
  494. return MachineName.str();
  495. }
  496. /// Get the name of a BasicBlock.
  497. template <> inline std::string getBlockName(const BasicBlock *BB) {
  498. assert(BB && "Unexpected nullptr");
  499. return BB->getName().str();
  500. }
  501. /// Graph of irreducible control flow.
  502. ///
  503. /// This graph is used for determining the SCCs in a loop (or top-level
  504. /// function) that has irreducible control flow.
  505. ///
  506. /// During the block frequency algorithm, the local graphs are defined in a
  507. /// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
  508. /// graphs for most edges, but getting others from \a LoopData::ExitMap. The
  509. /// latter only has successor information.
  510. ///
  511. /// \a IrreducibleGraph makes this graph explicit. It's in a form that can use
  512. /// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
  513. /// and it explicitly lists predecessors and successors. The initialization
  514. /// that relies on \c MachineBasicBlock is defined in the header.
  515. struct IrreducibleGraph {
  516. using BFIBase = BlockFrequencyInfoImplBase;
  517. BFIBase &BFI;
  518. using BlockNode = BFIBase::BlockNode;
  519. struct IrrNode {
  520. BlockNode Node;
  521. unsigned NumIn = 0;
  522. std::deque<const IrrNode *> Edges;
  523. IrrNode(const BlockNode &Node) : Node(Node) {}
  524. using iterator = std::deque<const IrrNode *>::const_iterator;
  525. iterator pred_begin() const { return Edges.begin(); }
  526. iterator succ_begin() const { return Edges.begin() + NumIn; }
  527. iterator pred_end() const { return succ_begin(); }
  528. iterator succ_end() const { return Edges.end(); }
  529. };
  530. BlockNode Start;
  531. const IrrNode *StartIrr = nullptr;
  532. std::vector<IrrNode> Nodes;
  533. SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
  534. /// Construct an explicit graph containing irreducible control flow.
  535. ///
  536. /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  537. /// top-level function, if \c OuterLoop is \c nullptr). Uses \c
  538. /// addBlockEdges to add block successors that have not been packaged into
  539. /// loops.
  540. ///
  541. /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  542. /// user of this.
  543. template <class BlockEdgesAdder>
  544. IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
  545. BlockEdgesAdder addBlockEdges) : BFI(BFI) {
  546. initialize(OuterLoop, addBlockEdges);
  547. }
  548. template <class BlockEdgesAdder>
  549. void initialize(const BFIBase::LoopData *OuterLoop,
  550. BlockEdgesAdder addBlockEdges);
  551. void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  552. void addNodesInFunction();
  553. void addNode(const BlockNode &Node) {
  554. Nodes.emplace_back(Node);
  555. BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  556. }
  557. void indexNodes();
  558. template <class BlockEdgesAdder>
  559. void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
  560. BlockEdgesAdder addBlockEdges);
  561. void addEdge(IrrNode &Irr, const BlockNode &Succ,
  562. const BFIBase::LoopData *OuterLoop);
  563. };
  564. template <class BlockEdgesAdder>
  565. void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
  566. BlockEdgesAdder addBlockEdges) {
  567. if (OuterLoop) {
  568. addNodesInLoop(*OuterLoop);
  569. for (auto N : OuterLoop->Nodes)
  570. addEdges(N, OuterLoop, addBlockEdges);
  571. } else {
  572. addNodesInFunction();
  573. for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
  574. addEdges(Index, OuterLoop, addBlockEdges);
  575. }
  576. StartIrr = Lookup[Start.Index];
  577. }
  578. template <class BlockEdgesAdder>
  579. void IrreducibleGraph::addEdges(const BlockNode &Node,
  580. const BFIBase::LoopData *OuterLoop,
  581. BlockEdgesAdder addBlockEdges) {
  582. auto L = Lookup.find(Node.Index);
  583. if (L == Lookup.end())
  584. return;
  585. IrrNode &Irr = *L->second;
  586. const auto &Working = BFI.Working[Node.Index];
  587. if (Working.isAPackage())
  588. for (const auto &I : Working.Loop->Exits)
  589. addEdge(Irr, I.first, OuterLoop);
  590. else
  591. addBlockEdges(*this, Irr, OuterLoop);
  592. }
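// Illustrative sketch of the intended use (mirroring
// BlockFrequencyInfoImpl::computeIrreducibleMass(); the lambda body is elided
// and the names are placeholders):
//
//   auto addBlockEdges = [&](IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
//                            const BlockFrequencyInfoImplBase::LoopData *OuterLoop) {
//     // ... call G.addEdge(Irr, Succ, OuterLoop) for each unpackaged successor ...
//   };
//   IrreducibleGraph G(BFI, OuterLoop, addBlockEdges);
//   // G can now be traversed through GraphTraits, e.g. with scc_iterator in
//   // analyzeIrreducible().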
  593. } // end namespace bfi_detail
  594. /// Shared implementation for block frequency analysis.
  595. ///
  596. /// This is a shared implementation of BlockFrequencyInfo and
  597. /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
  598. /// blocks.
  599. ///
  600. /// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
  601. /// which is called the header. A given loop, L, can have sub-loops, which are
  602. /// loops within the subgraph of L that exclude its header. (A "trivial" SCC
  603. /// consists of a single block that does not have a self-edge.)
  604. ///
  605. /// In addition to loops, this algorithm has limited support for irreducible
  606. /// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
  607. /// discovered on the fly, and modelled as loops with multiple headers.
  608. ///
  609. /// The headers of an irreducible sub-SCC consist of its entry blocks and all
  610. /// nodes that are targets of a backedge within it (excluding backedges within
  611. /// true sub-loops). Block frequency calculations act as if a block is
  612. /// inserted that intercepts all the edges to the headers. All backedges and
  613. /// entries point to this block. Its successors are the headers, which split
  614. /// the frequency evenly.
  615. ///
  616. /// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
  617. /// separates mass distribution from loop scaling, and dithers to eliminate
  618. /// probability mass loss.
  619. ///
  620. /// The implementation is split between BlockFrequencyInfoImpl, which knows the
  621. /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
  622. /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
  623. /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
  624. /// reverse-post order. This gives two advantages: it's easy to compare the
  625. /// relative ordering of two nodes, and maps keyed on BlockT can be represented
  626. /// by vectors.
  627. ///
  628. /// This algorithm is O(V+E), unless there is irreducible control flow, in
  629. /// which case it's O(V*E) in the worst case.
  630. ///
  631. /// These are the main stages:
  632. ///
  633. /// 0. Reverse post-order traversal (\a initializeRPOT()).
  634. ///
  635. /// Run a single post-order traversal and save it (in reverse) in RPOT.
  636. /// All other stages make use of this ordering. Save a lookup from BlockT
  637. /// to BlockNode (the index into RPOT) in Nodes.
  638. ///
  639. /// 1. Loop initialization (\a initializeLoops()).
  640. ///
  641. /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
  642. /// the algorithm. In particular, store the immediate members of each loop
  643. /// in reverse post-order.
  644. ///
  645. /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
  646. ///
  647. /// For each loop (bottom-up), distribute mass through the DAG resulting
  648. /// from ignoring backedges and treating sub-loops as a single pseudo-node.
  649. /// Track the backedge mass distributed to the loop header, and use it to
  650. /// calculate the loop scale (number of loop iterations). Immediate
  651. /// members that represent sub-loops will already have been visited and
  652. /// packaged into a pseudo-node.
  653. ///
  654. /// Distributing mass in a loop is a reverse-post-order traversal through
  655. /// the loop. Start by assigning full mass to the Loop header. For each
  656. /// node in the loop:
  657. ///
  658. /// - Fetch and categorize the weight distribution for its successors.
  659. /// If this is a packaged sub-loop, the weight distribution is stored
  660. /// in \a LoopData::Exits. Otherwise, fetch it from
  661. /// BranchProbabilityInfo.
  662. ///
  663. /// - Each successor is categorized as \a Weight::Local, a local edge
  664. /// within the current loop, \a Weight::Backedge, a backedge to the
  665. /// loop header, or \a Weight::Exit, any successor outside the loop.
  666. /// The weight, the successor, and its category are stored in \a
  667. /// Distribution. There can be multiple edges to each successor.
  668. ///
  669. /// - If there's a backedge to a non-header, there's an irreducible SCC.
  670. /// The usual flow is temporarily aborted. \a
  671. /// computeIrreducibleMass() finds the irreducible SCCs within the
  672. /// loop, packages them up, and restarts the flow.
  673. ///
  674. /// - Normalize the distribution: scale weights down so that their sum
  675. /// is 32-bits, and coalesce multiple edges to the same node.
  676. ///
  677. /// - Distribute the mass accordingly, dithering to minimize mass loss,
  678. /// as described in \a distributeMass().
  679. ///
  680. /// In the case of irreducible loops, instead of a single loop header,
  681. /// there will be several. The computation of backedge masses is similar
  682. /// but instead of having a single backedge mass, there will be one
  683. /// backedge per loop header. In these cases, each backedge will carry
  684. /// a mass proportional to the edge weights along the corresponding
  685. /// path.
  686. ///
  687. /// At the end of propagation, the full mass assigned to the loop will be
  688. /// distributed among the loop headers proportionally according to the
  689. /// mass flowing through their backedges.
  690. ///
  691. /// Finally, calculate the loop scale from the accumulated backedge mass.
  692. ///
  693. /// 3. Distribute mass in the function (\a computeMassInFunction()).
  694. ///
  695. /// Finally, distribute mass through the DAG resulting from packaging all
  696. /// loops in the function. This uses the same algorithm as distributing
  697. /// mass in a loop, except that there are no exit or backedge edges.
  698. ///
  699. /// 4. Unpackage loops (\a unwrapLoops()).
  700. ///
  701. /// Initialize each block's frequency to a floating point representation of
  702. /// its mass.
  703. ///
  704. /// Visit loops top-down, scaling the frequencies of its immediate members
  705. /// by the loop's pseudo-node's frequency.
  706. ///
  707. /// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
  708. ///
  709. /// Using the min and max frequencies as a guide, translate floating point
  710. /// frequencies to an appropriate range in uint64_t.
  711. ///
  712. /// It has some known flaws.
  713. ///
  714. /// - The model of irreducible control flow is a rough approximation.
  715. ///
  716. /// Modelling irreducible control flow exactly involves setting up and
  717. /// solving a group of infinite geometric series. Such precision is
  718. /// unlikely to be worthwhile, since most of our algorithms give up on
  719. /// irreducible control flow anyway.
  720. ///
  721. /// Nevertheless, we might find that we need to get closer. Here's a sort
  722. /// of TODO list for the model with diminishing returns, to be completed as
  723. /// necessary.
  724. ///
  725. /// - The headers for the \a LoopData representing an irreducible SCC
  726. /// include non-entry blocks. When these extra blocks exist, they
  727. /// indicate a self-contained irreducible sub-SCC. We could treat them
  728. /// as sub-loops, rather than arbitrarily shoving the problematic
  729. /// blocks into the headers of the main irreducible SCC.
  730. ///
  731. /// - Entry frequencies are assumed to be evenly split between the
  732. /// headers of a given irreducible SCC, which is the only option if we
  733. /// need to compute mass in the SCC before its parent loop. Instead,
  734. /// we could partially compute mass in the parent loop, and stop when
  735. /// we get to the SCC. Here, we have the correct ratio of entry
  736. /// masses, which we can use to adjust their relative frequencies.
  737. /// Compute mass in the SCC, and then continue propagation in the
  738. /// parent.
  739. ///
  740. /// - We can propagate mass iteratively through the SCC, for some fixed
  741. /// number of iterations. Each iteration starts by assigning the entry
  742. /// blocks their backedge mass from the prior iteration. The final
  743. /// mass for each block (and each exit, and the total backedge mass
  744. /// used for computing loop scale) is the sum of all iterations.
  745. /// (Running this until fixed point would "solve" the geometric
  746. /// series by simulation.)
  747. template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  748. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  749. friend struct bfi_detail::BlockEdgesAdder<BT>;
  750. using BlockT = typename bfi_detail::TypeMap<BT>::BlockT;
  751. using BlockKeyT = typename bfi_detail::TypeMap<BT>::BlockKeyT;
  752. using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT;
  753. using BranchProbabilityInfoT =
  754. typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT;
  755. using LoopT = typename bfi_detail::TypeMap<BT>::LoopT;
  756. using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT;
  757. using Successor = GraphTraits<const BlockT *>;
  758. using Predecessor = GraphTraits<Inverse<const BlockT *>>;
  759. using BFICallbackVH =
  760. bfi_detail::BFICallbackVH<BlockT, BlockFrequencyInfoImpl>;
  761. const BranchProbabilityInfoT *BPI = nullptr;
  762. const LoopInfoT *LI = nullptr;
  763. const FunctionT *F = nullptr;
  764. // All blocks in reverse postorder.
  765. std::vector<const BlockT *> RPOT;
  766. DenseMap<BlockKeyT, std::pair<BlockNode, BFICallbackVH>> Nodes;
  767. using rpot_iterator = typename std::vector<const BlockT *>::const_iterator;
  768. rpot_iterator rpot_begin() const { return RPOT.begin(); }
  769. rpot_iterator rpot_end() const { return RPOT.end(); }
  770. size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
  771. BlockNode getNode(const rpot_iterator &I) const {
  772. return BlockNode(getIndex(I));
  773. }
  774. BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB).first; }
  775. const BlockT *getBlock(const BlockNode &Node) const {
  776. assert(Node.Index < RPOT.size());
  777. return RPOT[Node.Index];
  778. }
  779. /// Run (and save) a post-order traversal.
  780. ///
  781. /// Saves a reverse post-order traversal of all the nodes in \a F.
  782. void initializeRPOT();
  783. /// Initialize loop data.
  784. ///
  785. /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
  786. /// each block to the deepest loop it's in, but we need the inverse. For each
  787. /// loop, we store in reverse post-order its "immediate" members, defined as
  788. /// the header, the headers of immediate sub-loops, and all other blocks in
  789. /// the loop that are not in sub-loops.
  790. void initializeLoops();
  791. /// Propagate to a block's successors.
  792. ///
  793. /// In the context of distributing mass through \c OuterLoop, divide the mass
  794. /// currently assigned to \c Node between its successors.
  795. ///
  796. /// \return \c true unless there's an irreducible backedge.
  797. bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
  798. /// Compute mass in a particular loop.
  799. ///
  800. /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  801. /// reverse post-order, distribute mass to its successors. Only visits nodes
  802. /// that have not been packaged into sub-loops.
  803. ///
  804. /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  805. /// \return \c true unless there's an irreducible backedge.
  806. bool computeMassInLoop(LoopData &Loop);
  807. /// Try to compute mass in the top-level function.
  808. ///
  809. /// Assign mass to the entry block, and then for each block in reverse
  810. /// post-order, distribute mass to its successors. Skips nodes that have
  811. /// been packaged into loops.
  812. ///
  813. /// \pre \a computeMassInLoops() has been called.
  814. /// \return \c true unless there's an irreducible backedge.
  815. bool tryToComputeMassInFunction();
  816. /// Compute mass in (and package up) irreducible SCCs.
  817. ///
  818. /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  819. /// of \c Insert), and call \a computeMassInLoop() on each of them.
  820. ///
  821. /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  822. ///
  823. /// \pre \a computeMassInLoop() has been called for each subloop of \c
  824. /// OuterLoop.
  825. /// \pre \c Insert points at the last loop successfully processed by \a
  826. /// computeMassInLoop().
  827. /// \pre \c OuterLoop has irreducible SCCs.
  828. void computeIrreducibleMass(LoopData *OuterLoop,
  829. std::list<LoopData>::iterator Insert);
  830. /// Compute mass in all loops.
  831. ///
  832. /// For each loop bottom-up, call \a computeMassInLoop().
  833. ///
  834. /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  835. /// contain irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
  836. /// re-enter \a computeMassInLoop().
  837. ///
  838. /// \post \a computeMassInLoop() has returned \c true for every loop.
  839. void computeMassInLoops();
  840. /// Compute mass in the top-level function.
  841. ///
  842. /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  843. /// compute mass in the top-level function.
  844. ///
  845. /// \post \a tryToComputeMassInFunction() has returned \c true.
  846. void computeMassInFunction();
  847. std::string getBlockName(const BlockNode &Node) const override {
  848. return bfi_detail::getBlockName(getBlock(Node));
  849. }
  850. /// The current implementation for computing relative block frequencies does
  851. /// not correctly handle control-flow graphs containing irreducible loops. To
  852. /// resolve the problem, we apply a post-processing step that iteratively
  853. /// updates block frequencies based on the frequencies of their predecessors.
  854. /// This corresponds to finding the stationary point of the Markov chain by
  855. /// an iterative method akin to PageRank computation.
  856. /// The algorithm takes at most O(|E| * IterativeBFIMaxIterationsPerBlock) steps but
  857. /// typically converges faster.
  858. ///
  859. /// Decide whether we want to apply iterative inference for a given function.
  860. bool needIterativeInference() const;
  861. /// Apply an iterative post-processing to infer correct counts for irr loops.
  862. void applyIterativeInference();
  863. using ProbMatrixType = std::vector<std::vector<std::pair<size_t, Scaled64>>>;
  864. /// Run iterative inference for a probability matrix and initial frequencies.
  865. void iterativeInference(const ProbMatrixType &ProbMatrix,
  866. std::vector<Scaled64> &Freq) const;
  867. /// Find all blocks to apply inference on, that is, reachable from the entry
  868. /// and backward reachable from exits along edges with positive probability.
  869. void findReachableBlocks(std::vector<const BlockT *> &Blocks) const;
  870. /// Build a matrix of probabilities with transitions (edges) between the
  871. /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P
  872. void initTransitionProbabilities(
  873. const std::vector<const BlockT *> &Blocks,
  874. const DenseMap<const BlockT *, size_t> &BlockIndex,
  875. ProbMatrixType &ProbMatrix) const;
  876. #ifndef NDEBUG
  877. /// Compute the discrepancy between current block frequencies and the
  878. /// probability matrix.
  879. Scaled64 discrepancy(const ProbMatrixType &ProbMatrix,
  880. const std::vector<Scaled64> &Freq) const;
  881. #endif
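// Rough sketch of the update rule behind the iterative inference (notation
// only; the exact worklist management and convergence bookkeeping live in the
// member function definitions): with ProbMatrix[I] holding pairs (J, P) where
// P = Pr[J -> I | J], an update of block I computes
//
//   Freq[I] = sum over (J, P) in ProbMatrix[I] of P * Freq[J]
//
// and updates repeat until the change in frequencies falls below a threshold
// derived from IterativeBFIPrecision, or the budget implied by
// IterativeBFIMaxIterationsPerBlock is exhausted.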
  882. public:
  883. BlockFrequencyInfoImpl() = default;
  884. const FunctionT *getFunction() const { return F; }
  885. void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI,
  886. const LoopInfoT &LI);
  887. using BlockFrequencyInfoImplBase::getEntryFreq;
  888. BlockFrequency getBlockFreq(const BlockT *BB) const {
  889. return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  890. }
  891. std::optional<uint64_t>
  892. getBlockProfileCount(const Function &F, const BlockT *BB,
  893. bool AllowSynthetic = false) const {
  894. return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB),
  895. AllowSynthetic);
  896. }
  897. std::optional<uint64_t>
  898. getProfileCountFromFreq(const Function &F, uint64_t Freq,
  899. bool AllowSynthetic = false) const {
  900. return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq,
  901. AllowSynthetic);
  902. }
  903. bool isIrrLoopHeader(const BlockT *BB) {
  904. return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
  905. }
  906. void setBlockFreq(const BlockT *BB, uint64_t Freq);
  907. void forgetBlock(const BlockT *BB) {
  908. // We don't erase the corresponding items from `Freqs`, `RPOT` and other data structures, to
  909. // avoid invalidating indices. Doing so would have saved some memory, but
  910. // it's not worth it.
  911. Nodes.erase(BB);
  912. }
  913. Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
  914. return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  915. }
  916. const BranchProbabilityInfoT &getBPI() const { return *BPI; }
  917. /// Print the frequencies for the current function.
  918. ///
  919. /// Prints the frequencies for the blocks in the current function.
  920. ///
  921. /// Blocks are printed in the natural iteration order of the function, rather
  922. /// than reverse post-order. This provides two advantages: writing -analyze
  923. /// tests is easier (since blocks come out in source order), and even
  924. /// unreachable blocks are printed.
  925. ///
  926. /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  927. /// we need to override it here.
  928. raw_ostream &print(raw_ostream &OS) const override;
  929. using BlockFrequencyInfoImplBase::dump;
  930. using BlockFrequencyInfoImplBase::printBlockFreq;
  931. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
  932. return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  933. }
  934. void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const;
  935. };
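// Illustrative sketch of driving the implementation directly (the in-tree
// BlockFrequencyInfo / MachineBlockFrequencyInfo wrappers do essentially
// this; F, BPI, and LI are assumed to be already computed):
//
//   BlockFrequencyInfoImpl<BasicBlock> BFI;
//   BFI.calculate(F, BPI, LI);
//   for (const BasicBlock &BB : F)
//     dbgs() << bfi_detail::getBlockName(&BB) << ": "
//            << BFI.getBlockFreq(&BB).getFrequency() << "\n";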
  936. namespace bfi_detail {
  937. template <class BFIImplT>
  938. class BFICallbackVH<BasicBlock, BFIImplT> : public CallbackVH {
  939. BFIImplT *BFIImpl;
  940. public:
  941. BFICallbackVH() = default;
  942. BFICallbackVH(const BasicBlock *BB, BFIImplT *BFIImpl)
  943. : CallbackVH(BB), BFIImpl(BFIImpl) {}
  944. virtual ~BFICallbackVH() = default;
  945. void deleted() override {
  946. BFIImpl->forgetBlock(cast<BasicBlock>(getValPtr()));
  947. }
  948. };
  949. /// Dummy implementation since MachineBasicBlocks aren't Values, so ValueHandles
  950. /// don't apply to them.
  951. template <class BFIImplT>
  952. class BFICallbackVH<MachineBasicBlock, BFIImplT> {
  953. public:
  954. BFICallbackVH() = default;
  955. BFICallbackVH(const MachineBasicBlock *, BFIImplT *) {}
  956. };
  957. } // end namespace bfi_detail
  958. template <class BT>
  959. void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
  960. const BranchProbabilityInfoT &BPI,
  961. const LoopInfoT &LI) {
  962. // Save the parameters.
  963. this->BPI = &BPI;
  964. this->LI = &LI;
  965. this->F = &F;
  966. // Clean up left-over data structures.
  967. BlockFrequencyInfoImplBase::clear();
  968. RPOT.clear();
  969. Nodes.clear();
  970. // Initialize.
  971. LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName()
  972. << "\n================="
  973. << std::string(F.getName().size(), '=') << "\n");
  974. initializeRPOT();
  975. initializeLoops();
  976. // Visit loops in post-order to find the local mass distribution, and then do
  977. // the full function.
  978. computeMassInLoops();
  979. computeMassInFunction();
  980. unwrapLoops();
  981. // Apply a post-processing step improving computed frequencies for functions
  982. // with irreducible loops.
  983. if (needIterativeInference())
  984. applyIterativeInference();
  985. finalizeMetrics();
  986. if (CheckBFIUnknownBlockQueries) {
  987. // To detect BFI queries for unknown blocks, add entries for unreachable
  988. // blocks, if any. This is to distinguish between known/existing unreachable
  989. // blocks and unknown blocks.
  990. for (const BlockT &BB : F)
  991. if (!Nodes.count(&BB))
  992. setBlockFreq(&BB, 0);
  993. }
  994. }
  995. template <class BT>
  996. void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
  997. if (Nodes.count(BB))
  998. BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
  999. else {
  1000. // If BB is a newly added block after BFI is done, we need to create a new
  1001. // BlockNode for it assigned with a new index. The index can be determined
  1002. // by the size of Freqs.
  1003. BlockNode NewNode(Freqs.size());
  1004. Nodes[BB] = {NewNode, BFICallbackVH(BB, this)};
  1005. Freqs.emplace_back();
  1006. BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq);
  1007. }
  1008. }
  1009. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  1010. const BlockT *Entry = &F->front();
  1011. RPOT.reserve(F->size());
  1012. std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  1013. std::reverse(RPOT.begin(), RPOT.end());
  1014. assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
  1015. "More nodes in function than Block Frequency Info supports");
  1016. LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n");
  1017. for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
  1018. BlockNode Node = getNode(I);
  1019. LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
  1020. << "\n");
  1021. Nodes[*I] = {Node, BFICallbackVH(*I, this)};
  1022. }
  1023. Working.reserve(RPOT.size());
  1024. for (size_t Index = 0; Index < RPOT.size(); ++Index)
  1025. Working.emplace_back(Index);
  1026. Freqs.resize(RPOT.size());
  1027. }
  1028. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  1029. LLVM_DEBUG(dbgs() << "loop-detection\n");
  1030. if (LI->empty())
  1031. return;
  1032. // Visit loops top down and assign them an index.
  1033. std::deque<std::pair<const LoopT *, LoopData *>> Q;
  1034. for (const LoopT *L : *LI)
  1035. Q.emplace_back(L, nullptr);
  1036. while (!Q.empty()) {
  1037. const LoopT *Loop = Q.front().first;
  1038. LoopData *Parent = Q.front().second;
  1039. Q.pop_front();
  1040. BlockNode Header = getNode(Loop->getHeader());
  1041. assert(Header.isValid());
  1042. Loops.emplace_back(Parent, Header);
  1043. Working[Header.Index].Loop = &Loops.back();
  1044. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
  1045. for (const LoopT *L : *Loop)
  1046. Q.emplace_back(L, &Loops.back());
  1047. }
  1048. // Visit nodes in reverse post-order and add them to their deepest containing
  1049. // loop.
  1050. for (size_t Index = 0; Index < RPOT.size(); ++Index) {
  1051. // Loop headers have already been mostly mapped.
  1052. if (Working[Index].isLoopHeader()) {
  1053. LoopData *ContainingLoop = Working[Index].getContainingLoop();
  1054. if (ContainingLoop)
  1055. ContainingLoop->Nodes.push_back(Index);
  1056. continue;
  1057. }
  1058. const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
  1059. if (!Loop)
  1060. continue;
  1061. // Add this node to its containing loop's member list.
  1062. BlockNode Header = getNode(Loop->getHeader());
  1063. assert(Header.isValid());
  1064. const auto &HeaderData = Working[Header.Index];
  1065. assert(HeaderData.isLoopHeader());
  1066. Working[Index].Loop = HeaderData.Loop;
  1067. HeaderData.Loop->Nodes.push_back(Index);
  1068. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header)
  1069. << ": member = " << getBlockName(Index) << "\n");
  1070. }
  1071. }

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  // Visit loops with the deepest first, and the top-level loops last.
  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
    if (computeMassInLoop(*L))
      continue;
    auto Next = std::next(L);
    computeIrreducibleMass(&*L, L.base());
    L = std::prev(Next);
    if (computeMassInLoop(*L))
      continue;
    llvm_unreachable("unhandled irreducible control flow");
  }
}
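
// A note on the iterator juggling in computeMassInLoops(): when
// computeMassInLoop() fails, computeIrreducibleMass() splices new LoopData
// entries into Loops immediately after the current loop (before L.base()).
// Because a reverse_iterator dereferences the element just before its base,
// *L would then refer to one of the freshly inserted loops. Capturing
// Next = std::next(L) first and restoring L = std::prev(Next) afterwards
// re-anchors the iterator on the original loop so it can be retried.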

template <class BT>
bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  // Compute mass in loop.
  LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");

  if (Loop.isIrreducible()) {
    LLVM_DEBUG(dbgs() << "isIrreducible = true\n");
    Distribution Dist;
    unsigned NumHeadersWithWeight = 0;
    std::optional<uint64_t> MinHeaderWeight;
    DenseSet<uint32_t> HeadersWithoutWeight;
    HeadersWithoutWeight.reserve(Loop.NumHeaders);
    for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
      auto &HeaderNode = Loop.Nodes[H];
      const BlockT *Block = getBlock(HeaderNode);
      IsIrrLoopHeader.set(Loop.Nodes[H].Index);
      std::optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
      if (!HeaderWeight) {
        LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on "
                          << getBlockName(HeaderNode) << "\n");
        HeadersWithoutWeight.insert(H);
        continue;
      }
      LLVM_DEBUG(dbgs() << getBlockName(HeaderNode)
                        << " has irr loop header weight " << *HeaderWeight
                        << "\n");
      NumHeadersWithWeight++;
      uint64_t HeaderWeightValue = *HeaderWeight;
      if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
        MinHeaderWeight = HeaderWeightValue;
      if (HeaderWeightValue) {
        Dist.addLocal(HeaderNode, HeaderWeightValue);
      }
    }
    // As a heuristic, if some headers don't have a weight, give them the
    // minimum weight seen: a value in the general range of the other headers'
    // weights does not disrupt the existing trends too much, and the minimum
    // seems to perform better than the average.
    // FIXME: better update in the passes that drop the header weight.
    // If no headers have a weight, give them even weight (use weight 1).
    if (!MinHeaderWeight)
      MinHeaderWeight = 1;
    for (uint32_t H : HeadersWithoutWeight) {
      auto &HeaderNode = Loop.Nodes[H];
      assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
             "Shouldn't have a weight metadata");
      uint64_t MinWeight = *MinHeaderWeight;
      LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to "
                        << getBlockName(HeaderNode) << "\n");
      if (MinWeight)
        Dist.addLocal(HeaderNode, MinWeight);
    }
    distributeIrrLoopHeaderMass(Dist);
    for (const BlockNode &M : Loop.Nodes)
      if (!propagateMassToSuccessors(&Loop, M))
        llvm_unreachable("unhandled irreducible control flow");
    if (NumHeadersWithWeight == 0)
      // No headers have metadata. Adjust header mass.
      adjustLoopHeaderMass(Loop);
  } else {
    Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
    if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
      llvm_unreachable("irreducible control flow to loop header!?");
    for (const BlockNode &M : Loop.members())
      if (!propagateMassToSuccessors(&Loop, M))
        // Irreducible backedge.
        return false;
  }

  computeLoopScale(Loop);
  packageLoop(Loop);
  return true;
}
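
// computeMassInLoop() returns false only when propagation out of a reducible
// loop runs into an irreducible backedge; the caller then breaks the loop up
// via computeIrreducibleMass() and retries. For irreducible loops the header
// mass is seeded from the irr_loop header weights (or the minimum-weight
// heuristic above), and propagation there is expected to always succeed.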

template <class BT>
bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  // Compute mass in function.
  LLVM_DEBUG(dbgs() << "compute-mass-in-function\n");
  assert(!Working.empty() && "no blocks in function");
  assert(!Working[0].isLoopHeader() && "entry block is a loop header");

  Working[0].getMass() = BlockMass::getFull();
  for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
    // Check for nodes that have been packaged.
    BlockNode Node = getNode(I);
    if (Working[Node.Index].isPackaged())
      continue;

    if (!propagateMassToSuccessors(nullptr, Node))
      return false;
  }
  return true;
}

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  if (tryToComputeMassInFunction())
    return;
  computeIrreducibleMass(nullptr, Loops.begin());
  if (tryToComputeMassInFunction())
    return;
  llvm_unreachable("unhandled irreducible control flow");
}
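
// tryToComputeMassInFunction() can only fail in the presence of irreducible
// control flow that sits outside every natural loop. After
// computeIrreducibleMass(nullptr, Loops.begin()) packages those strongly
// connected regions as pseudo-loops, the retry is expected to succeed, hence
// the llvm_unreachable() fallback.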

template <class BT>
bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const {
  if (!UseIterativeBFIInference)
    return false;
  if (!F->getFunction().hasProfileData())
    return false;
  // Apply iterative inference only if the function contains irreducible loops;
  // otherwise, computed block frequencies are reasonably correct.
  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
    if (L->isIrreducible())
      return true;
  }
  return false;
}

template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() {
  // Extract blocks for processing: a block is considered for inference iff it
  // can be reached from the entry by edges with a positive probability.
  // Non-processed blocks are assigned zero frequency and are ignored in the
  // computation.
  std::vector<const BlockT *> ReachableBlocks;
  findReachableBlocks(ReachableBlocks);
  if (ReachableBlocks.empty())
    return;

  // The map is used to index successors/predecessors of reachable blocks in
  // the ReachableBlocks vector.
  DenseMap<const BlockT *, size_t> BlockIndex;
  // Extract initial frequencies for the reachable blocks.
  auto Freq = std::vector<Scaled64>(ReachableBlocks.size());
  Scaled64 SumFreq;
  for (size_t I = 0; I < ReachableBlocks.size(); I++) {
    const BlockT *BB = ReachableBlocks[I];
    BlockIndex[BB] = I;
    Freq[I] = getFloatingBlockFreq(BB);
    SumFreq += Freq[I];
  }
  assert(!SumFreq.isZero() && "empty initial block frequencies");

  LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName()
                    << " with " << ReachableBlocks.size() << " blocks\n");

  // Normalize frequencies so that they sum up to 1.0.
  for (auto &Value : Freq) {
    Value /= SumFreq;
  }

  // Set up edge probabilities using a sparse matrix representation:
  // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P.
  ProbMatrixType ProbMatrix;
  initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix);

  // Run the propagation.
  iterativeInference(ProbMatrix, Freq);

  // Assign the computed frequency values.
  for (const BlockT &BB : *F) {
    auto Node = getNode(&BB);
    if (!Node.isValid())
      continue;
    if (BlockIndex.count(&BB)) {
      Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]];
    } else {
      Freqs[Node.Index].Scaled = Scaled64::getZero();
    }
  }
}
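
// The net effect of applyIterativeInference(): blocks that do not lie on any
// positive-probability path from the entry to an exit are pinned to frequency
// zero, while the remaining blocks receive frequencies that (approximately)
// satisfy the flow equation Freq[I] = sum over predecessors J of
// Freq[J] * Pr[J -> I | J], with the initial BFI estimates used as the
// starting point of the iteration.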

template <class BT>
void BlockFrequencyInfoImpl<BT>::iterativeInference(
    const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const {
  assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 &&
         "incorrectly specified precision");
  // Convert double precision to Scaled64.
  const auto Precision =
      Scaled64::getInverse(static_cast<uint64_t>(1.0 / IterativeBFIPrecision));
  const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size();
#ifndef NDEBUG
  LLVM_DEBUG(dbgs() << "  Initial discrepancy = "
                    << discrepancy(ProbMatrix, Freq).toString() << "\n");
#endif

  // Successors[I] holds the unique successors of the I-th block.
  auto Successors = std::vector<std::vector<size_t>>(Freq.size());
  for (size_t I = 0; I < Freq.size(); I++) {
    for (const auto &Jump : ProbMatrix[I]) {
      Successors[Jump.first].push_back(I);
    }
  }

  // To speed up the computation, we maintain a set of "active" blocks whose
  // frequencies need to be updated based on the incoming edges.
  // The set is dynamic and changes after every update. Initially all blocks
  // with a positive frequency are active.
  auto IsActive = BitVector(Freq.size(), false);
  std::queue<size_t> ActiveSet;
  for (size_t I = 0; I < Freq.size(); I++) {
    if (Freq[I] > 0) {
      ActiveSet.push(I);
      IsActive[I] = true;
    }
  }

  // Iterate over the blocks propagating frequencies.
  size_t It = 0;
  while (It++ < MaxIterations && !ActiveSet.empty()) {
    size_t I = ActiveSet.front();
    ActiveSet.pop();
    IsActive[I] = false;

    // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix.
    // Special care is taken for self-edges: their contribution is accounted
    // for by dividing the result by (1.0 - SelfProb), where SelfProb is the
    // sum of probabilities on the block's self-edges.
    Scaled64 NewFreq;
    Scaled64 OneMinusSelfProb = Scaled64::getOne();
    for (const auto &Jump : ProbMatrix[I]) {
      if (Jump.first == I) {
        OneMinusSelfProb -= Jump.second;
      } else {
        NewFreq += Freq[Jump.first] * Jump.second;
      }
    }
    if (OneMinusSelfProb != Scaled64::getOne())
      NewFreq /= OneMinusSelfProb;

    // If the block's frequency has changed enough, then
    // make sure the block and its successors are in the active set.
    auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I];
    if (Change > Precision) {
      ActiveSet.push(I);
      IsActive[I] = true;
      for (size_t Succ : Successors[I]) {
        if (!IsActive[Succ]) {
          ActiveSet.push(Succ);
          IsActive[Succ] = true;
        }
      }
    }

    // Update the frequency for the block.
    Freq[I] = NewFreq;
  }

  LLVM_DEBUG(dbgs() << "  Completed " << It << " inference iterations"
                    << format(" (%0.0f per block)", double(It) / Freq.size())
                    << "\n");
#ifndef NDEBUG
  LLVM_DEBUG(dbgs() << "  Final discrepancy = "
                    << discrepancy(ProbMatrix, Freq).toString() << "\n");
#endif
}
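
// iterativeInference() is, in effect, a worklist-driven Gauss-Seidel solver
// for the system Freq[I] = sum_J Freq[J] * Pr[J -> I | J]. Illustrative
// example of one update with a self-loop: if block I has a single other
// predecessor J with Freq[J] = 0.2 and Pr[J -> I] = 1.0, and I's self-edge
// probability is 0.5, then NewFreq = 0.2 / (1.0 - 0.5) = 0.4. A block is
// re-activated only while successive updates differ by more than the
// requested precision, and the loop additionally stops after
// IterativeBFIMaxIterationsPerBlock * #blocks iterations overall.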

template <class BT>
void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
    std::vector<const BlockT *> &Blocks) const {
  // Find all blocks to apply inference on, that is, blocks reachable from the
  // entry along edges with non-zero probabilities.
  std::queue<const BlockT *> Queue;
  SmallPtrSet<const BlockT *, 8> Reachable;
  const BlockT *Entry = &F->front();
  Queue.push(Entry);
  Reachable.insert(Entry);
  while (!Queue.empty()) {
    const BlockT *SrcBB = Queue.front();
    Queue.pop();
    for (const BlockT *DstBB : children<const BlockT *>(SrcBB)) {
      auto EP = BPI->getEdgeProbability(SrcBB, DstBB);
      if (EP.isZero())
        continue;
      if (Reachable.insert(DstBB).second)
        Queue.push(DstBB);
    }
  }

  // Find all blocks that are backward reachable from the exit blocks along
  // (backward) edges with non-zero probabilities.
  SmallPtrSet<const BlockT *, 8> InverseReachable;
  for (const BlockT &BB : *F) {
    // An exit block is a block without any successors.
    bool HasSucc = GraphTraits<const BlockT *>::child_begin(&BB) !=
                   GraphTraits<const BlockT *>::child_end(&BB);
    if (!HasSucc && Reachable.count(&BB)) {
      Queue.push(&BB);
      InverseReachable.insert(&BB);
    }
  }
  while (!Queue.empty()) {
    const BlockT *SrcBB = Queue.front();
    Queue.pop();
    for (const BlockT *DstBB : children<Inverse<const BlockT *>>(SrcBB)) {
      auto EP = BPI->getEdgeProbability(DstBB, SrcBB);
      if (EP.isZero())
        continue;
      if (InverseReachable.insert(DstBB).second)
        Queue.push(DstBB);
    }
  }

  // Collect the result.
  Blocks.reserve(F->size());
  for (const BlockT &BB : *F) {
    if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
      Blocks.push_back(&BB);
    }
  }
}
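
// A block survives findReachableBlocks() only if it is forward-reachable from
// the entry and backward-reachable from some exit block along edges with
// non-zero probability, i.e., it lies on at least one positive-probability
// entry-to-exit path. If the function has no exit blocks at all (e.g. it ends
// in an infinite loop), InverseReachable stays empty and the inference is
// skipped by the early return in applyIterativeInference().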

template <class BT>
void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities(
    const std::vector<const BlockT *> &Blocks,
    const DenseMap<const BlockT *, size_t> &BlockIndex,
    ProbMatrixType &ProbMatrix) const {
  const size_t NumBlocks = Blocks.size();
  auto Succs = std::vector<std::vector<std::pair<size_t, Scaled64>>>(NumBlocks);
  auto SumProb = std::vector<Scaled64>(NumBlocks);

  // Find unique successors and corresponding probabilities for every block.
  for (size_t Src = 0; Src < NumBlocks; Src++) {
    const BlockT *BB = Blocks[Src];
    SmallPtrSet<const BlockT *, 2> UniqueSuccs;
    for (const auto SI : children<const BlockT *>(BB)) {
      // Ignore cold blocks.
      if (BlockIndex.find(SI) == BlockIndex.end())
        continue;
      // Ignore parallel edges between BB and SI blocks.
      if (!UniqueSuccs.insert(SI).second)
        continue;
      // Ignore jumps with zero probability.
      auto EP = BPI->getEdgeProbability(BB, SI);
      if (EP.isZero())
        continue;

      auto EdgeProb =
          Scaled64::getFraction(EP.getNumerator(), EP.getDenominator());
      size_t Dst = BlockIndex.find(SI)->second;
      Succs[Src].push_back(std::make_pair(Dst, EdgeProb));
      SumProb[Src] += EdgeProb;
    }
  }

  // Add transitions for every jump with positive branch probability.
  ProbMatrix = ProbMatrixType(NumBlocks);
  for (size_t Src = 0; Src < NumBlocks; Src++) {
    // Ignore blocks w/o successors.
    if (Succs[Src].empty())
      continue;

    assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block");
    for (auto &Jump : Succs[Src]) {
      size_t Dst = Jump.first;
      Scaled64 Prob = Jump.second;
      ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src]));
    }
  }

  // Add transitions from sinks to the source.
  size_t EntryIdx = BlockIndex.find(&F->front())->second;
  for (size_t Src = 0; Src < NumBlocks; Src++) {
    if (Succs[Src].empty()) {
      ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne()));
    }
  }
}
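
// ProbMatrix is indexed by destination: ProbMatrix[Dst] lists (Src, P) pairs
// where P is the probability of Src branching to Dst, renormalized over the
// successors that survived the filtering above. Every exit block additionally
// transfers all of its frequency back to the entry, which removes absorbing
// states from the induced Markov chain. Illustrative example for a diamond
// entry -> {a, b} -> exit with a 60/40 split:
//   ProbMatrix[a]     = {(entry, 0.6)}
//   ProbMatrix[b]     = {(entry, 0.4)}
//   ProbMatrix[exit]  = {(a, 1.0), (b, 1.0)}
//   ProbMatrix[entry] = {(exit, 1.0)}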

#ifndef NDEBUG
template <class BT>
BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy(
    const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const {
  assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block");
  Scaled64 Discrepancy;
  for (size_t I = 0; I < ProbMatrix.size(); I++) {
    Scaled64 Sum;
    for (const auto &Jump : ProbMatrix[I]) {
      Sum += Freq[Jump.first] * Jump.second;
    }
    Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I];
  }
  // Normalizing by the frequency of the entry block.
  return Discrepancy / Freq[0];
}
#endif
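
// The discrepancy metric above is the L1 residual of the flow equations,
// sum_I |Freq[I] - sum_J Freq[J] * Pr[J -> I | J]|, normalized by the entry
// block's frequency; it is zero exactly when Freq is a fixed point of the
// transition matrix.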

/// \note This should be a lambda, but that crashes GCC 4.7.
namespace bfi_detail {

template <class BT> struct BlockEdgesAdder {
  using BlockT = BT;
  using LoopData = BlockFrequencyInfoImplBase::LoopData;
  using Successor = GraphTraits<const BlockT *>;

  const BlockFrequencyInfoImpl<BT> &BFI;

  explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
      : BFI(BFI) {}

  void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
                  const LoopData *OuterLoop) {
    const BlockT *BB = BFI.RPOT[Irr.Node.Index];
    for (const auto Succ : children<const BlockT *>(BB))
      G.addEdge(Irr, BFI.getNode(Succ), OuterLoop);
  }
};

} // end namespace bfi_detail

template <class BT>
void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
    LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  LLVM_DEBUG(dbgs() << "analyze-irreducible-in-";
             if (OuterLoop) dbgs()
             << "loop: " << getLoopName(*OuterLoop) << "\n";
             else dbgs() << "function\n");

  using namespace bfi_detail;

  // Ideally, addBlockEdges() would be declared here as a lambda, but that
  // crashes GCC 4.7.
  BlockEdgesAdder<BT> addBlockEdges(*this);
  IrreducibleGraph G(*this, OuterLoop, addBlockEdges);

  for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
    computeMassInLoop(L);

  if (!OuterLoop)
    return;
  updateLoopWithIrreducible(*OuterLoop);
}

// A helper function that converts a branch probability into a weight.
inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) {
  return Prob.getNumerator();
}

template <class BT>
bool
BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
                                                      const BlockNode &Node) {
  LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  // Calculate probability for successors.
  Distribution Dist;
  if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
    assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
    if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
      // Irreducible backedge.
      return false;
  } else {
    const BlockT *BB = getBlock(Node);
    for (auto SI = GraphTraits<const BlockT *>::child_begin(BB),
              SE = GraphTraits<const BlockT *>::child_end(BB);
         SI != SE; ++SI)
      if (!addToDist(
              Dist, OuterLoop, Node, getNode(*SI),
              getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI))))
        // Irreducible backedge.
        return false;
  }

  // Distribute mass to successors, saving exit and backedge data in the
  // loop header.
  distributeMass(Node, OuterLoop, Dist);
  return true;
}
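
// When Node is a packaged loop, its mass leaves through the loop's cached exit
// edges (addLoopSuccessorsToDist) instead of through individual member blocks.
// Both false-returning paths above correspond to irreducible backedges, i.e.
// edges that re-enter a loop at a block other than its recognized header; the
// callers handle them by rerunning the analysis with computeIrreducibleMass().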

template <class BT>
raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  if (!F)
    return OS;
  OS << "block-frequency-info: " << F->getName() << "\n";
  for (const BlockT &BB : *F) {
    OS << " - " << bfi_detail::getBlockName(&BB) << ": float = ";
    getFloatingBlockFreq(&BB).print(OS, 5)
        << ", int = " << getBlockFreq(&BB).getFrequency();
    if (std::optional<uint64_t> ProfileCount =
            BlockFrequencyInfoImplBase::getBlockProfileCount(
                F->getFunction(), getNode(&BB)))
      OS << ", count = " << *ProfileCount;
    if (std::optional<uint64_t> IrrLoopHeaderWeight =
            BB.getIrrLoopHeaderWeight())
      OS << ", irr_loop_header_weight = " << *IrrLoopHeaderWeight;
    OS << "\n";
  }

  // Add an extra newline for readability.
  OS << "\n";
  return OS;
}

template <class BT>
void BlockFrequencyInfoImpl<BT>::verifyMatch(
    BlockFrequencyInfoImpl<BT> &Other) const {
  bool Match = true;
  DenseMap<const BlockT *, BlockNode> ValidNodes;
  DenseMap<const BlockT *, BlockNode> OtherValidNodes;
  for (auto &Entry : Nodes) {
    const BlockT *BB = Entry.first;
    if (BB) {
      ValidNodes[BB] = Entry.second.first;
    }
  }
  for (auto &Entry : Other.Nodes) {
    const BlockT *BB = Entry.first;
    if (BB) {
      OtherValidNodes[BB] = Entry.second.first;
    }
  }

  unsigned NumValidNodes = ValidNodes.size();
  unsigned NumOtherValidNodes = OtherValidNodes.size();
  if (NumValidNodes != NumOtherValidNodes) {
    Match = false;
    dbgs() << "Number of blocks mismatch: " << NumValidNodes << " vs "
           << NumOtherValidNodes << "\n";
  } else {
    for (auto &Entry : ValidNodes) {
      const BlockT *BB = Entry.first;
      BlockNode Node = Entry.second;
      if (OtherValidNodes.count(BB)) {
        BlockNode OtherNode = OtherValidNodes[BB];
        const auto &Freq = Freqs[Node.Index];
        const auto &OtherFreq = Other.Freqs[OtherNode.Index];
        if (Freq.Integer != OtherFreq.Integer) {
          Match = false;
          dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " "
                 << Freq.Integer << " vs " << OtherFreq.Integer << "\n";
        }
      } else {
        Match = false;
        dbgs() << "Block " << bfi_detail::getBlockName(BB) << " index "
               << Node.Index << " does not exist in Other.\n";
      }
    }
    // If there's a valid node in OtherValidNodes that's not in ValidNodes,
    // either the above num check or the check on OtherValidNodes will fail.
  }
  if (!Match) {
    dbgs() << "This\n";
    print(dbgs());
    dbgs() << "Other\n";
    Other.print(dbgs());
  }
  assert(Match && "BFI mismatch");
}

// Graph trait base class for block frequency information graph
// viewer.

enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count };

template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
  using GTraits = GraphTraits<BlockFrequencyInfoT *>;
  using NodeRef = typename GTraits::NodeRef;
  using EdgeIter = typename GTraits::ChildIteratorType;
  using NodeIter = typename GTraits::nodes_iterator;

  uint64_t MaxFrequency = 0;

  explicit BFIDOTGraphTraitsBase(bool isSimple = false)
      : DefaultDOTGraphTraits(isSimple) {}

  static StringRef getGraphName(const BlockFrequencyInfoT *G) {
    return G->getFunction()->getName();
  }

  std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph,
                                unsigned HotPercentThreshold = 0) {
    std::string Result;
    if (!HotPercentThreshold)
      return Result;

    // Compute MaxFrequency on the fly:
    if (!MaxFrequency) {
      for (NodeIter I = GTraits::nodes_begin(Graph),
                    E = GTraits::nodes_end(Graph);
           I != E; ++I) {
        NodeRef N = *I;
        MaxFrequency =
            std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency());
      }
    }
    BlockFrequency Freq = Graph->getBlockFreq(Node);
    BlockFrequency HotFreq =
        (BlockFrequency(MaxFrequency) *
         BranchProbability::getBranchProbability(HotPercentThreshold, 100));

    if (Freq < HotFreq)
      return Result;

    raw_string_ostream OS(Result);
    OS << "color=\"red\"";
    OS.flush();
    return Result;
  }

  std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph,
                           GVDAGType GType, int layout_order = -1) {
    std::string Result;
    raw_string_ostream OS(Result);

    if (layout_order != -1)
      OS << Node->getName() << "[" << layout_order << "] : ";
    else
      OS << Node->getName() << " : ";
    switch (GType) {
    case GVDT_Fraction:
      Graph->printBlockFreq(OS, Node);
      break;
    case GVDT_Integer:
      OS << Graph->getBlockFreq(Node).getFrequency();
      break;
    case GVDT_Count: {
      auto Count = Graph->getBlockProfileCount(Node);
      if (Count)
        OS << *Count;
      else
        OS << "Unknown";
      break;
    }
    case GVDT_None:
      llvm_unreachable("If we are not supposed to render a graph we should "
                       "never reach this point.");
    }
    return Result;
  }

  std::string getEdgeAttributes(NodeRef Node, EdgeIter EI,
                                const BlockFrequencyInfoT *BFI,
                                const BranchProbabilityInfoT *BPI,
                                unsigned HotPercentThreshold = 0) {
    std::string Str;
    if (!BPI)
      return Str;

    BranchProbability BP = BPI->getEdgeProbability(Node, EI);
    uint32_t N = BP.getNumerator();
    uint32_t D = BP.getDenominator();
    double Percent = 100.0 * N / D;
    raw_string_ostream OS(Str);
    OS << format("label=\"%.1f%%\"", Percent);

    if (HotPercentThreshold) {
      BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP;
      BlockFrequency HotFreq = BlockFrequency(MaxFrequency) *
                               BranchProbability(HotPercentThreshold, 100);
      if (EFreq >= HotFreq) {
        OS << ",color=\"red\"";
      }
    }
    OS.flush();
    return Str;
  }
};

} // end namespace llvm

#undef DEBUG_TYPE

#endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif